diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..a6295018eb6294856e6afd9133e0acafe3a28477 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: CI +on: + push: + branches: + - master + pull_request: + schedule: + - cron: '17 3 * * 0' + +jobs: + flake8: + name: Flake8 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 710ad6471854e51faa2d2602f917a05e1ea77501..fc5d32166bd9f6889f29ce79b984a075600cc4b3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -53,3 +53,12 @@ Documentation: - python3 # needs CUDA headers to compile - nvidia-titan-x + +Flake8: + script: + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py + tags: + - python3 + except: + - tags diff --git a/examples/demo.py b/examples/demo.py index 2bd78b4ed8f7dc56f3f04c52b86bf4e19202518f..a1c404209e929a2c3126efeaf21d9efc48ca7bdb 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,6 +1,4 @@ # Sample source code from the Tutorial Introduction in the documentation. 
-from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit # noqa from pycuda.compiler import SourceModule diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index 0fcbb5fee20157cc02c33762d3347467584af574..9baadc13a307d04d1c27b4c18acff5b50d0dbe65 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.gpuarray as gpuarray import pycuda.autoinit import numpy diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py index efb4d4c1094fea9eadb8d89c669e1f01db47f38e..ddd89f5afafc891b75649854bb5bc2f72b5e00d4 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -1,10 +1,8 @@ -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy import numpy.linalg as la from pycuda.compiler import SourceModule -from six.moves import range thread_strides = 16 block_size = 256 diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py index bac4e0219d3b50c649c46e23595f4c3b540316ad..4093434d81c1e66b31d99ec62944d0352c64b09d 100644 --- a/examples/demo_meta_template.py +++ b/examples/demo_meta_template.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy diff --git a/examples/demo_struct.py b/examples/demo_struct.py index 5591ffb70c6e632d0aeb7a4e4c7ad815b5301ec9..3f230264a58747f2f2f5b3e37da854398d09cc13 100644 --- a/examples/demo_struct.py +++ b/examples/demo_struct.py @@ -1,6 +1,4 @@ # prepared invocations and structures ----------------------------------------- -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy diff --git a/examples/dump_properties.py b/examples/dump_properties.py index 
65a684bba2b7d5978d05c6e4c40af60936a62502..a960619cf706ca1a7e912636238bfedc90d57384 100644 --- a/examples/dump_properties.py +++ b/examples/dump_properties.py @@ -1,7 +1,4 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as drv -from six.moves import range @@ -18,5 +15,5 @@ for ordinal in range(drv.Device.count()): atts.sort() for att, value in atts: - print(" %s: %s" % (att, value)) + print(f" {att}: {value}") diff --git a/examples/fill_gpu_with_nans.py b/examples/fill_gpu_with_nans.py index 3a8e949799acfc80d360c66d4c3e982802bf7c99..d9306d84fc94324deb03c4b89b98fe16713737a1 100644 --- a/examples/fill_gpu_with_nans.py +++ b/examples/fill_gpu_with_nans.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda diff --git a/examples/from-wiki/2d_fft.py b/examples/from-wiki/2d_fft.py index 0efb82c140488e01b16835da6c2babf93df9c8bc..424984a452eefea5fd414312c8aefeebda9464f0 100644 --- a/examples/from-wiki/2d_fft.py +++ b/examples/from-wiki/2d_fft.py @@ -1,5 +1,3 @@ - - #!python import numpy import scipy.misc diff --git a/examples/from-wiki/2dfft.py b/examples/from-wiki/2dfft.py index cd2143bac79219ba7608b5442b9b41dc474751a2..67585107019452911a3233ecaecd49d8ba7492b5 100644 --- a/examples/from-wiki/2dfft.py +++ b/examples/from-wiki/2dfft.py @@ -1,5 +1,3 @@ - - #!python # Paste code for your example here. 
diff --git a/examples/from-wiki/arithmetic_example.py b/examples/from-wiki/arithmetic_example.py index bf0ed190713e80fa2a853a384f8305ca664760e5..35dd9d3eeae60c262acf2bb39cf7493afd987c66 100644 --- a/examples/from-wiki/arithmetic_example.py +++ b/examples/from-wiki/arithmetic_example.py @@ -1,5 +1,3 @@ - - #!python #!python import pycuda.driver as cuda diff --git a/examples/from-wiki/c++_function_templates.py b/examples/from-wiki/c++_function_templates.py index 7a50ba348ae39fe2f65a0c0f9068317befce4f8b..b3ef4f8196c39e00416bc97ef33e2661eb26ae19 100644 --- a/examples/from-wiki/c++_function_templates.py +++ b/examples/from-wiki/c++_function_templates.py @@ -1,5 +1,3 @@ - - #!python import pycuda.gpuarray as gpuarray import pycuda.driver as drv diff --git a/examples/from-wiki/computing.py b/examples/from-wiki/computing.py index cd2143bac79219ba7608b5442b9b41dc474751a2..67585107019452911a3233ecaecd49d8ba7492b5 100644 --- a/examples/from-wiki/computing.py +++ b/examples/from-wiki/computing.py @@ -1,5 +1,3 @@ - - #!python # Paste code for your example here. 
diff --git a/examples/from-wiki/convolution.py b/examples/from-wiki/convolution.py index 9502daf3efc9b1b258f49838a9209ac398f01656..12246a09f551fd8d443b27dab8dcbfdef33d6c0b 100644 --- a/examples/from-wiki/convolution.py +++ b/examples/from-wiki/convolution.py @@ -1,5 +1,3 @@ - - #!python ''' /* diff --git a/examples/from-wiki/demo_complex.py b/examples/from-wiki/demo_complex.py index 53263e37827e26bd1ab4f95c0fa571e29182f978..031221ee6449342afebe850ca8727ff7ef654844 100644 --- a/examples/from-wiki/demo_complex.py +++ b/examples/from-wiki/demo_complex.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/demo_meta_cgen.py b/examples/from-wiki/demo_meta_cgen.py index f914b41290a76d50be1ec92b135359e176ba790a..51e1d737da1f0d46e935464e735146aa0fb6fef6 100644 --- a/examples/from-wiki/demo_meta_cgen.py +++ b/examples/from-wiki/demo_meta_cgen.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/from-wiki/demo_meta_matrixmul_cheetah.py b/examples/from-wiki/demo_meta_matrixmul_cheetah.py index a3cf4b9c8c06dd7b856f3bfb9aa006b44e65a039..9eb6531e0eba66e12ca3aa4ad3dc567c9f3cda19 100644 --- a/examples/from-wiki/demo_meta_matrixmul_cheetah.py +++ b/examples/from-wiki/demo_meta_matrixmul_cheetah.py @@ -1,10 +1,7 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import division """ PyCuda Optimized Matrix Multiplication diff --git a/examples/from-wiki/distance_element_wise3d.py b/examples/from-wiki/distance_element_wise3d.py index 9a5fa74fa51f83fe2684720363c20cc5e8fc9ef5..1a5138394b0a4f1e0d47548e8f1bdb96ab95ac11 100644 --- a/examples/from-wiki/distance_element_wise3d.py +++ b/examples/from-wiki/distance_element_wise3d.py @@ -1,5 +1,3 @@ - - #!python import pycuda.gpuarray as gpuarray import pycuda.driver as cuda diff --git a/examples/from-wiki/game_of_life.py b/examples/from-wiki/game_of_life.py index 
16958bf2ae0463893717455ade17e5f1d11d423a..eb9427a7f85a5b9ac1a121a28671ab0a63cf3fba 100644 --- a/examples/from-wiki/game_of_life.py +++ b/examples/from-wiki/game_of_life.py @@ -1,5 +1,3 @@ - - #!python # Conway's Game of Life Accelerated with PyCUDA # Luis Villasenor diff --git a/examples/from-wiki/gl_interop.py b/examples/from-wiki/gl_interop.py index 1f7f2f81bf543ec0d2c9f8bc6e6eb9c3126195d9..505a71662ac251e6488b9011b94ce11d5710bd1e 100644 --- a/examples/from-wiki/gl_interop.py +++ b/examples/from-wiki/gl_interop.py @@ -1,5 +1,3 @@ - - #!python # GL interoperability example, by Peter Berrington. # Draws a rotating teapot, using cuda to invert the RGB value diff --git a/examples/from-wiki/gpu_scalar_mult.py b/examples/from-wiki/gpu_scalar_mult.py index 669d94691333badc592a2cc15993c89a1fe6baa2..e3c1742122c6ff02e12ce800acd09adf55936792 100644 --- a/examples/from-wiki/gpu_scalar_mult.py +++ b/examples/from-wiki/gpu_scalar_mult.py @@ -1,5 +1,3 @@ - - #!python import numpy import pycuda.autoinit diff --git a/examples/from-wiki/kernel_concurrency.py b/examples/from-wiki/kernel_concurrency.py index a48e642b0ed7cdbf3cc62b26839532b485ce7d04..c561980a9d553111d8c5e4e7b4db52a5cdec166e 100644 --- a/examples/from-wiki/kernel_concurrency.py +++ b/examples/from-wiki/kernel_concurrency.py @@ -1,5 +1,3 @@ - - #!python #! /usr/bin/env python # A simple program to illustrate kernel concurrency with PyCuda. @@ -52,7 +50,7 @@ stream, event = [], [] marker_names = ['kernel_begin', 'kernel_end'] for k in range(n): stream.append(drv.Stream()) - event.append(dict([(marker_names[l], drv.Event()) for l in range(len(marker_names))])) + event.append({marker_names[l]: drv.Event() for l in range(len(marker_names))}) # Transfer to device. 
for k in range(n): diff --git a/examples/from-wiki/light_field_3d_viewer.py b/examples/from-wiki/light_field_3d_viewer.py index 1129d2ef8aa09dad3cc35550d35d35d83de760c0..3f22e005a43c983631444babc97e8fbb35b0b0a3 100644 --- a/examples/from-wiki/light_field_3d_viewer.py +++ b/examples/from-wiki/light_field_3d_viewer.py @@ -1,5 +1,3 @@ - - #!python """ 3D display of Light Field images. @@ -18,7 +16,6 @@ Prerequisites: Author: Amit Aides. amitibo at technion . ac . il """ -from __future__ import division from enthought.traits.api import HasTraits, Range, on_trait_change from enthought.traits.ui.api import View, Item @@ -110,7 +107,7 @@ class LFapplication(HasTraits): ) def __init__(self, img_path): - super(LFapplication, self).__init__() + super().__init__() # # Load image data @@ -119,12 +116,12 @@ class LFapplication(HasTraits): lenslet_path = base_path + '-lenslet.txt' optics_path = base_path + '-optics.txt' - with open(lenslet_path, 'r') as f: + with open(lenslet_path) as f: tmp = eval(f.readline()) x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ np.array(tmp, dtype=np.float32) - with open(optics_path, 'r') as f: + with open(optics_path) as f: for line in f: name, val = line.strip().split() try: diff --git a/examples/from-wiki/mandelbrot.py b/examples/from-wiki/mandelbrot.py index 21873dd1615b38cf5b547d000a56b6aca9699007..1d40e3378263a7404a3f1f9cee6260d3a6329980 100644 --- a/examples/from-wiki/mandelbrot.py +++ b/examples/from-wiki/mandelbrot.py @@ -1,5 +1,3 @@ - - #!python # Mandelbrot calculate using GPU, Serial numpy and faster numpy # Use to show the speed difference between CPU and GPU calculations diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 15d9e801960306cfe64d0dabf13b53e73b82ef02..e434f1be89cf74aad8b5ec6fa04c65f6927571f8 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -1,5 +1,3 @@ - - #!python # Interactive Mandelbrot 
Set Accelerated using PyCUDA # Classical Iteration Method @@ -108,21 +106,21 @@ def zoom_on_square(eclick, erelease): def key_selector(event): global N,side,x0,y0,myobj,M,power,L,i_cmap,n_grid #print(' Key pressed.') - if event.key == u'up': # Increase max number of iterations + if event.key == 'up': # Increase max number of iterations L=int(L*1.2); print("Maximum number of iterations changed to %d" % L) func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'down': # Decrease max number of iterations + if event.key == 'down': # Decrease max number of iterations L=int(L/1.2); print("Maximum number of iterations changed to %d" % L) func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'right': # Increase number of pixels + if event.key == 'right': # Increase number of pixels N=int(N*1.2); n_grid=int(N/16.); N=n_block*n_grid; @@ -132,7 +130,7 @@ def key_selector(event): myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'left': # Decrease number of pixels + if event.key == 'left': # Decrease number of pixels N=int(N/1.2); n_grid=int(N/16.); N=n_block*n_grid; diff --git a/examples/from-wiki/manhattan_distance_for_2D_array.py b/examples/from-wiki/manhattan_distance_for_2D_array.py index e3201b8a40a6988176762dc9ac4b4ef33f26b2d9..3d7c837193832c78d0d2fb5551e242ca008411a0 100644 --- a/examples/from-wiki/manhattan_distance_for_2D_array.py +++ 
b/examples/from-wiki/manhattan_distance_for_2D_array.py @@ -1,5 +1,3 @@ - - #!python import numpy diff --git a/examples/from-wiki/matrix_transpose.py b/examples/from-wiki/matrix_transpose.py index 2075fc7cc3db42c368cdffbe02118ab35947325d..d8ad7848e991c6b7fc55def5af174941bb8e16a7 100644 --- a/examples/from-wiki/matrix_transpose.py +++ b/examples/from-wiki/matrix_transpose.py @@ -1,12 +1,9 @@ - - #!python # Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Transposition of a matrix # by Hendrik Riedmann -from __future__ import division, print_function import pycuda.driver as cuda import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/matrixmul_simple.py b/examples/from-wiki/matrixmul_simple.py index 7686cbfddc0a3ca82d9bc53f2e061fe48d6f15bc..5eda5648cd24e5c3716609ea53cad81eb5c8b327 100644 --- a/examples/from-wiki/matrixmul_simple.py +++ b/examples/from-wiki/matrixmul_simple.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- diff --git a/examples/from-wiki/matrixmul_tiled.py b/examples/from-wiki/matrixmul_tiled.py index 072011ea5430c2509593bb7e253736d57b606f92..65dd7b0307b6809add4dd57c94629c8103592018 100644 --- a/examples/from-wiki/matrixmul_tiled.py +++ b/examples/from-wiki/matrixmul_tiled.py @@ -1,10 +1,7 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import division """ Multiples two square matrices together using multiple blocks and shared memory. diff --git a/examples/from-wiki/measure_gpuarray_speed_random.py b/examples/from-wiki/measure_gpuarray_speed_random.py index b0d87ec40c44747eff7dee5ab2dfeca884bf2e72..8f822af0ab35bfb4865e83450203e15d9490e8a3 100644 --- a/examples/from-wiki/measure_gpuarray_speed_random.py +++ b/examples/from-wiki/measure_gpuarray_speed_random.py @@ -1,5 +1,3 @@ - - #!python #! /usr/bin/env python # DO NOT USE THIS AS A BENCHMARK. 
See diff --git a/examples/from-wiki/multiple_threads.py b/examples/from-wiki/multiple_threads.py index 73068170c9f13290025176e5197c1e58c1cd4d5d..0a5a7738a2f77b04e8bf397a8257094392e6d262 100644 --- a/examples/from-wiki/multiple_threads.py +++ b/examples/from-wiki/multiple_threads.py @@ -1,5 +1,3 @@ - - #!python # Derived from a test case by Chris Heuser # Also see FAQ about PyCUDA and threads. diff --git a/examples/from-wiki/plot_random_data.py b/examples/from-wiki/plot_random_data.py index d3fb1a10a4a8bd8109c739ccb523de20df70ebe6..2e0118c3fe59e78d53ca77d85abcaf89ba1e2495 100644 --- a/examples/from-wiki/plot_random_data.py +++ b/examples/from-wiki/plot_random_data.py @@ -1,5 +1,3 @@ - - #!python # simple module to show the plotting of random data diff --git a/examples/from-wiki/rotate.py b/examples/from-wiki/rotate.py index db7307c9f7f352e74581c411b6d285f84bebc006..a94ec5aa41206456ed8e55df04e0053518227fa1 100644 --- a/examples/from-wiki/rotate.py +++ b/examples/from-wiki/rotate.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python -tt # encoding: utf-8 diff --git a/examples/from-wiki/select_to_list.py b/examples/from-wiki/select_to_list.py index d62371471fabc1c3d18299eb372747b956cdb6c5..13c310727530efbd43b31fed63f63bbec555471f 100644 --- a/examples/from-wiki/select_to_list.py +++ b/examples/from-wiki/select_to_list.py @@ -1,12 +1,9 @@ - - #!python # Exercise 2 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Generate an array of random numbers between 0 and 1 # List the indices of those numbers that are greater than a given limit -from __future__ import division import pycuda.driver as cuda import pycuda.autoinit import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/simple_rgb2gray.py b/examples/from-wiki/simple_rgb2gray.py index c90045ebb158b38066fe1b0db431bd1d96de341f..9c648f4aecccf50f306c7250d7e9b6713ef458a0 100644 --- a/examples/from-wiki/simple_rgb2gray.py +++ b/examples/from-wiki/simple_rgb2gray.py @@ -1,5 +1,3 @@ - - #!python 
__author__ = 'ashwin' diff --git a/examples/from-wiki/simple_speed_test.py b/examples/from-wiki/simple_speed_test.py index 1d6588f92d39fef5a8965234984fbed0668ea000..fccc57c8e72786e4d34ee7ef8b7f0593d970bf30 100644 --- a/examples/from-wiki/simple_speed_test.py +++ b/examples/from-wiki/simple_speed_test.py @@ -1,5 +1,3 @@ - - #!python # SimpleSpeedTest.py diff --git a/examples/from-wiki/sobel_filter.py b/examples/from-wiki/sobel_filter.py index 2170da9056101bf38234eeb89e298d8a1b40c774..a3dd9f5b610e88a89e9636d55853b9bc9e7b3eb6 100644 --- a/examples/from-wiki/sobel_filter.py +++ b/examples/from-wiki/sobel_filter.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python #-*- coding: utf-8 -*- diff --git a/examples/from-wiki/sparse_solve.py b/examples/from-wiki/sparse_solve.py index 8d057c322da51300507b3ea4bf7afaa1b617983a..359414bc16fb4c7d23fb5707e47cd7499c7be2c9 100644 --- a/examples/from-wiki/sparse_solve.py +++ b/examples/from-wiki/sparse_solve.py @@ -1,7 +1,4 @@ - - #!python -from __future__ import division import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/threads_and_blocks.py b/examples/from-wiki/threads_and_blocks.py index 3b0b2c217faba7d458203a22c7c6bc3fb8ed2b4d..caa851f30a3424f42ad0b99da77abcb0e926d812 100644 --- a/examples/from-wiki/threads_and_blocks.py +++ b/examples/from-wiki/threads_and_blocks.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/from-wiki/thrust_interop.py b/examples/from-wiki/thrust_interop.py index a9868f65b6048d7c50b0bdbf17c9114acf9d4bab..f408e029b3d1f3927731ba8e0df5fa35fb51aff7 100644 --- a/examples/from-wiki/thrust_interop.py +++ b/examples/from-wiki/thrust_interop.py @@ -1,5 +1,3 @@ - - #!python import pycuda diff --git a/examples/from-wiki/using_printf.py b/examples/from-wiki/using_printf.py index 160a156c917dfd7c0178c407883476aa182696f2..c12cfa42717c16c193af9989995c15e9d1f78674 100644 --- 
a/examples/from-wiki/using_printf.py +++ b/examples/from-wiki/using_printf.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/hello_gpu.py b/examples/hello_gpu.py index 1c8fd04ed1b65ea9f2fef25b509eef9085355d63..23c51adba399ebf5b7b2ccecd2c295df17f496a4 100644 --- a/examples/hello_gpu.py +++ b/examples/hello_gpu.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as drv import pycuda.tools import pycuda.autoinit diff --git a/pycuda/_cluda.py b/pycuda/_cluda.py index 8cddd4d657504425a2d1401877e7809d797b6cee..fa5eda2a94fc849eec0870b312ee94eab4851e76 100644 --- a/pycuda/_cluda.py +++ b/pycuda/_cluda.py @@ -24,7 +24,3 @@ CLUDA_PREAMBLE = """ #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z """ - - - - diff --git a/pycuda/_mymako.py b/pycuda/_mymako.py index cfd3736026391e19419d2f8fba795ccaa8c02105..826dde6198fd5c48746c312ed313091e78845633 100644 --- a/pycuda/_mymako.py +++ b/pycuda/_mymako.py @@ -1,15 +1,16 @@ -from __future__ import absolute_import try: - import mako.template + import mako.template # noqa: F401 except ImportError: raise ImportError( - "Some of PyCUDA's facilities require the Mako templating engine.\n" - "You or a piece of software you have used has tried to call such a\n" - "part of PyCUDA, but there was a problem importing Mako.\n\n" - "You may install mako now by typing one of:\n" - "- easy_install Mako\n" - "- pip install Mako\n" - "- aptitude install python-mako\n" - "\nor whatever else is appropriate for your system.") + "Some of PyCUDA's facilities require the Mako templating engine.\n" + "You or a piece of software you have used has tried to call such a\n" + "part of PyCUDA, but there was a problem importing Mako.\n\n" + "You may install mako now by typing one of:\n" + "- easy_install Mako\n" + "- pip install Mako\n" + "- aptitude install python-mako\n" + "\nor whatever else is appropriate for your system." 
+ ) -from mako import * + +from mako import * # noqa: F403, F401 diff --git a/pycuda/autoinit.py b/pycuda/autoinit.py index f52e1e381b9d9aa8e6678e125afe9b61061edc07..664ed1c8e1725c46c2adcb17f5f15bcfc5ffe7eb 100644 --- a/pycuda/autoinit.py +++ b/pycuda/autoinit.py @@ -1,21 +1,24 @@ -from __future__ import absolute_import import pycuda.driver as cuda +import atexit # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context +from pycuda.tools import make_default_context # noqa: E402 + global context context = make_default_context() device = context.get_device() + def _finish_up(): global context context.pop() context = None from pycuda.tools import clear_context_caches + clear_context_caches() -import atexit + atexit.register(_finish_up) diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 2206e5880cdce1e4f3c286de5ad4d80f8d391861..3b46ff7aeac1a2e169968258b4e3e1a27351ced8 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -1,13 +1,11 @@ -from __future__ import division -from __future__ import absolute_import - from pycuda.tools import context_dependent_memoize import numpy as np def platform_bits(): import sys - if sys.maxsize > 2**32: + + if sys.maxsize > 2 ** 32: return 64 else: return 32 @@ -15,27 +13,35 @@ def platform_bits(): def has_stack(): from pycuda.driver import Context + return Context.get_device().compute_capability() >= (2, 0) def has_double_support(): from pycuda.driver import Context + return Context.get_device().compute_capability() >= (1, 3) @context_dependent_memoize def sizeof(type_name, preamble=""): from pycuda.compiler import SourceModule - mod = SourceModule(""" + + mod = SourceModule( + """ %s extern "C" __global__ void write_size(size_t *output) { *output = sizeof(%s); } - """ % (preamble, type_name), no_extern_c=True) + """ + % (preamble, type_name), + no_extern_c=True, + ) import pycuda.gpuarray as gpuarray + output = gpuarray.empty((), dtype=np.uintp) mod.get_function("write_size")(output, 
block=(1, 1, 1), grid=(1, 1)) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index b73793b6dee779f268cab3f80929f30012c3c3f5..065a6314715fb81de49cb74f39d47a3fc4b4b61b 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -1,13 +1,13 @@ -from __future__ import absolute_import -from __future__ import print_function from pytools import memoize + # don't import pycuda.driver here--you'll create an import loop +import os + import sys from tempfile import mkstemp from os import unlink from pytools.prefork import call_capture_output -from six.moves import map @memoize @@ -17,6 +17,7 @@ def get_nvcc_version(nvcc): if result != 0 or not stdout: from warnings import warn + warn("NVCC version could not be determined.") stdout = "nvcc unknown version" @@ -26,40 +27,48 @@ def get_nvcc_version(nvcc): def _new_md5(): try: import hashlib + return hashlib.md5() except ImportError: # for Python << 2.5 import md5 + return md5.new() def preprocess_source(source, options, nvcc): - handle, source_path = mkstemp(suffix='.cu') + handle, source_path = mkstemp(suffix=".cu") - outf = open(source_path, 'w') + outf = open(source_path, "w") outf.write(source) outf.close() os.close(handle) - cmdline = [nvcc, '--preprocess'] + options + [source_path] - if 'win32' in sys.platform: - cmdline.extend(['--compiler-options', '-EP']) + cmdline = [nvcc, "--preprocess"] + options + [source_path] + if "win32" in sys.platform: + cmdline.extend(["--compiler-options", "-EP"]) else: - cmdline.extend(['--compiler-options', '-P']) + cmdline.extend(["--compiler-options", "-P"]) result, stdout, stderr = call_capture_output(cmdline, error_on_nonzero=False) if result != 0: from pycuda.driver import CompileError - raise CompileError("nvcc preprocessing of %s failed" % source_path, - cmdline, stderr=stderr) + + raise CompileError( + "nvcc preprocessing of %s failed" % source_path, cmdline, stderr=stderr + ) # sanity check - if len(stdout) < 0.5*len(source): + if len(stdout) < 0.5 * len(source): from 
pycuda.driver import CompileError - raise CompileError("nvcc preprocessing of %s failed with ridiculously " - "small code output - likely unsupported compiler." % source_path, - cmdline, stderr=stderr.decode("utf-8", "replace")) + + raise CompileError( + "nvcc preprocessing of %s failed with ridiculously " + "small code output - likely unsupported compiler." % source_path, + cmdline, + stderr=stderr.decode("utf-8", "replace"), + ) unlink(source_path) @@ -74,7 +83,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if cache_dir: checksum = _new_md5() - if '#include' in source: + if "#include" in source: checksum.update(preprocess_source(source, options, nvcc).encode("utf-8")) else: checksum.update(source.encode("utf-8")) @@ -83,6 +92,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): checksum.update(option.encode("utf-8")) checksum.update(get_nvcc_version(nvcc).encode("utf-8")) from pycuda.characterize import platform_bits + checksum.update(str(platform_bits()).encode("utf-8")) cache_file = checksum.hexdigest() @@ -99,6 +109,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): pass from tempfile import mkdtemp + file_dir = mkdtemp() file_root = "kernel" @@ -116,12 +127,13 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): print("*** compiler output in %s" % file_dir) cmdline = [nvcc, "--" + target] + options + [cu_file_name] - result, stdout, stderr = call_capture_output(cmdline, - cwd=file_dir, error_on_nonzero=False) + result, stdout, stderr = call_capture_output( + cmdline, cwd=file_dir, error_on_nonzero=False + ) try: result_f = open(join(file_dir, file_root + "." 
+ target), "rb") - except IOError: + except OSError: no_output = True else: no_output = False @@ -129,22 +141,35 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if result != 0 or (no_output and (stdout or stderr)): if result == 0: from warnings import warn - warn("PyCUDA: nvcc exited with status 0, but appears to have " - "encountered an error") + + warn( + "PyCUDA: nvcc exited with status 0, but appears to have " + "encountered an error" + ) from pycuda.driver import CompileError - raise CompileError("nvcc compilation of %s failed" % cu_file_path, - cmdline, stdout=stdout.decode("utf-8", "replace"), - stderr=stderr.decode("utf-8", "replace")) + + raise CompileError( + "nvcc compilation of %s failed" % cu_file_path, + cmdline, + stdout=stdout.decode("utf-8", "replace"), + stderr=stderr.decode("utf-8", "replace"), + ) if stdout or stderr: - lcase_err_text = (stdout+stderr).decode("utf-8", "replace").lower() + lcase_err_text = (stdout + stderr).decode("utf-8", "replace").lower() from warnings import warn + if "demoted" in lcase_err_text or "demoting" in lcase_err_text: - warn("nvcc said it demoted types in source code it " + warn( + "nvcc said it demoted types in source code it " "compiled--this is likely not what you want.", - stacklevel=4) - warn("The CUDA compiler succeeded, but said the following:\n" - + (stdout+stderr).decode("utf-8", "replace"), stacklevel=4) + stacklevel=4, + ) + warn( + "The CUDA compiler succeeded, but said the following:\n" + + (stdout + stderr).decode("utf-8", "replace"), + stacklevel=4, + ) result_data = result_f.read() result_f.close() @@ -156,6 +181,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if not keep: from os import listdir, unlink, rmdir + for name in listdir(file_dir): unlink(join(file_dir, name)) rmdir(file_dir) @@ -169,6 +195,7 @@ def _get_per_user_string(): except ImportError: checksum = _new_md5() from os import environ + 
checksum.update(environ["USERNAME"].encode("utf-8")) return checksum.hexdigest() else: @@ -177,19 +204,29 @@ def _get_per_user_string(): def _find_pycuda_include_path(): from pkg_resources import Requirement, resource_filename + return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda") -import os DEFAULT_NVCC_FLAGS = [ - _flag.strip() for _flag in - os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split() - if _flag.strip()] - - -def compile(source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], target="cubin"): + _flag.strip() + for _flag in os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split() + if _flag.strip() +] + + +def compile( + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + target="cubin", +): assert target in ["cubin", "ptx", "fatbin"] @@ -202,13 +239,16 @@ def compile(source, nvcc="nvcc", options=None, keep=False, options = options[:] if arch is None: from pycuda.driver import Error + try: from pycuda.driver import Context + arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING + if CUDA_DEBUGGING: cache_dir = False keep = True @@ -222,14 +262,18 @@ def compile(source, nvcc="nvcc", options=None, keep=False, if cache_dir is None: import appdirs - cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), - "compiler-cache-v1") + + cache_dir = os.path.join( + appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1" + ) from os import makedirs + try: makedirs(cache_dir) except OSError as e: from errno import EEXIST + if e.errno != EEXIST: raise @@ -239,32 +283,36 @@ def compile(source, nvcc="nvcc", options=None, keep=False, if code is not None: options.extend(["-code", code]) - if 'darwin' in sys.platform and sys.maxsize == 9223372036854775807: - options.append('-m64') - elif 'win32' in sys.platform and 
sys.maxsize == 9223372036854775807: - options.append('-m64') - elif 'win32' in sys.platform and sys.maxsize == 2147483647: - options.append('-m32') + if "darwin" in sys.platform and sys.maxsize == 9223372036854775807: + options.append("-m64") + elif "win32" in sys.platform and sys.maxsize == 9223372036854775807: + options.append("-m64") + elif "win32" in sys.platform and sys.maxsize == 2147483647: + options.append("-m32") include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: - options.append("-I"+i) + options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir, target) -class CudaModule(object): +class CudaModule: def _check_arch(self, arch): if arch is None: return try: from pycuda.driver import Context + capability = Context.get_device().compute_capability() if tuple(map(int, tuple(arch.split("_")[1]))) > capability: from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") + + warn( + "trying to compile for a compute capability " + "higher than selected GPU" + ) except Exception: pass @@ -277,20 +325,41 @@ class CudaModule(object): def get_function(self, name): return self.module.get_function(name) + class SourceModule(CudaModule): - ''' + """ Creates a Module from a single .cu source object linked against the static CUDA runtime. 
- ''' - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): + """ + + def __init__( + self, + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + ): self._check_arch(arch) - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) + cubin = compile( + source, + nvcc, + options, + keep, + no_extern_c, + arch, + code, + cache_dir, + include_dirs, + ) from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) self._bind_module() @@ -318,21 +387,36 @@ def _find_nvcc_on_path(): class DynamicModule(CudaModule): - ''' + """ Creates a Module from multiple .cu source, library file and/or data objects linked against the static or dynamic CUDA runtime. - ''' - def __init__(self, nvcc='nvcc', link_options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False, - cuda_libdir=None): + """ + + def __init__( + self, + nvcc="nvcc", + link_options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + message_handler=None, + log_verbose=False, + cuda_libdir=None, + ): from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() - if compute_capability < (3,5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % - (compute_capability[0], compute_capability[1])) + if compute_capability < (3, 5): + raise Exception( + "Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!" 
+ % (compute_capability[0], compute_capability[1]) + ) else: from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) self._check_arch(arch) self.nvcc = nvcc @@ -347,7 +431,7 @@ class DynamicModule(CudaModule): self.module = None def _locate_cuda_libdir(self): - ''' + """ Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. In case the caller supplied cuda_libdir in the constructor @@ -371,52 +455,68 @@ class DynamicModule(CudaModule): https://cmake.org/cmake/help/v3.0/module/FindCUDA.html https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake - Verify all Linux code paths somehow - ''' + """ from os.path import isfile, join from platform import system as platform_system + system = platform_system() libdir, libptn = None, None - if system == 'Windows': + if system == "Windows": if self.cuda_libdir is not None: libdir = self.cuda_libdir - elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): - libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') - libptn = '%s.lib' - elif system in ['Linux', 'Darwin']: + elif "CUDA_PATH" in os.environ and isfile( + join(os.environ["CUDA_PATH"], "lib\\x64\\cudadevrt.lib") + ): + libdir = join(os.environ["CUDA_PATH"], "lib\\x64") + libptn = "%s.lib" + elif system in ["Linux", "Darwin"]: if self.cuda_libdir is not None: libdir = self.cuda_libdir - elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): - libdir = join(os.environ['CUDA_ROOT'], 'lib64') - elif 'LD_LIBRARY_PATH' in os.environ: - for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): - if isfile(join(ld_path, 'libcudadevrt.a')): + elif "CUDA_ROOT" in os.environ and isfile( + join(os.environ["CUDA_ROOT"], "lib64/libcudadevrt.a") + ): + libdir = join(os.environ["CUDA_ROOT"], "lib64") + elif "LD_LIBRARY_PATH" in os.environ: + for ld_path in 
os.environ["LD_LIBRARY_PATH"].split(":"): + if isfile(join(ld_path, "libcudadevrt.a")): libdir = ld_path break - if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): - libdir = '/usr/lib/x86_64-linux-gnu' + if libdir is None and isfile("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"): + libdir = "/usr/lib/x86_64-linux-gnu" if libdir is None: nvcc_path = _find_nvcc_on_path() if nvcc_path is not None: libdir = join(os.path.dirname(nvcc_path), "..", "lib64") - libptn = 'lib%s.a' + libptn = "lib%s.a" if libdir is None: - raise RuntimeError('Unable to locate the CUDA SDK installation ' - 'directory, set CUDA library path manually') + raise RuntimeError( + "Unable to locate the CUDA SDK installation " + "directory, set CUDA library path manually" + ) return libdir, libptn - def add_source(self, source, nvcc_options=None, name='kernel.ptx'): - ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, - keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, - code=self.code, cache_dir=self.cache_dir, - include_dirs=self.include_dirs, target="ptx") + def add_source(self, source, nvcc_options=None, name="kernel.ptx"): + ptx = compile( + source, + nvcc=self.nvcc, + options=nvcc_options, + keep=self.keep, + no_extern_c=self.no_extern_c, + arch=self.arch, + code=self.code, + cache_dir=self.cache_dir, + include_dirs=self.include_dirs, + target="ptx", + ) from pycuda.driver import jit_input_type + self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_data(self, data, input_type, name='unknown'): + def add_data(self, data, input_type, name="unknown"): self.linker.add_data(data, input_type, name) return self @@ -428,10 +528,12 @@ class DynamicModule(CudaModule): if self.libdir is None: self.libdir, self.libptn = self._locate_cuda_libdir() from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): raise OSError('CUDA SDK library file "%s" not found' % libpath) from pycuda.driver import 
jit_input_type + self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -443,28 +545,46 @@ class DynamicModule(CudaModule): class DynamicSourceModule(DynamicModule): - ''' + """ Creates a Module from a single .cu source object linked against the dynamic CUDA runtime. - compiler generates PTX relocatable device code (rdc) from source that can be linked with other relocatable device code - source is linked against the CUDA device runtime library cudadevrt - library cudadevrt is statically linked into the generated Module - ''' - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None): - super(DynamicSourceModule, self).__init__(nvcc=nvcc, - link_options=None, keep=keep, no_extern_c=no_extern_c, - arch=arch, code=code, cache_dir=cache_dir, - include_dirs=include_dirs, cuda_libdir=cuda_libdir) + """ + + def __init__( + self, + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + cuda_libdir=None, + ): + super().__init__( + nvcc=nvcc, + link_options=None, + keep=keep, + no_extern_c=no_extern_c, + arch=arch, + code=code, + cache_dir=cache_dir, + include_dirs=include_dirs, + cuda_libdir=cuda_libdir, + ) if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] - if '-rdc=true' not in options: - options.append('-rdc=true') - if '-lcudadevrt' not in options: - options.append('-lcudadevrt') + if "-rdc=true" not in options: + options.append("-rdc=true") + if "-lcudadevrt" not in options: + options.append("-lcudadevrt") self.add_source(source, nvcc_options=options) - self.add_stdlib('cudadevrt') + self.add_stdlib("cudadevrt") self.link() diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd62a8fdc2b4b6899b5c2f2a3267e5a9dd8..ab2b32c86079b05af501e0eebeabacd7e181d2e7 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -1,4 +1,3 @@ -from __future__ 
import absolute_import import pycuda.gpuarray as gpuarray import pycuda.elementwise as elementwise import numpy as np @@ -10,7 +9,9 @@ def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): if stream_or_out is not None: - warnings.warn("please use 'out' or 'stream' keyword arguments", DeprecationWarning) + warnings.warn( + "please use 'out' or 'stream' keyword arguments", DeprecationWarning + ) if isinstance(stream_or_out, Stream): stream = stream_or_out out = None @@ -19,10 +20,10 @@ def _make_unary_array_func(name): out = stream_or_out out, stream = None, None - if 'out' in kwargs: - out = kwargs['out'] - if 'stream' in kwargs: - stream = kwargs['stream'] + if "out" in kwargs: + out = kwargs["out"] + if "stream" in kwargs: + stream = kwargs["stream"] if array.dtype == np.float32: func_name = name + "f" @@ -30,8 +31,9 @@ def _make_unary_array_func(name): func_name = name if not array.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if out is None: out = array._new_like_me() @@ -41,12 +43,20 @@ def _make_unary_array_func(name): assert out.shape == array.shape func = elementwise.get_unary_func_kernel(func_name, array.dtype) - func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + func.prepared_async_call( + array._grid, + array._block, + stream, + array.gpudata, + out.gpudata, + array.mem_size, + ) return out + return f + fabs = _make_unary_array_func("fabs") ceil = _make_unary_array_func("ceil") floor = _make_unary_array_func("floor") @@ -66,70 +76,104 @@ sinh = _make_unary_array_func("sinh") cosh = _make_unary_array_func("cosh") tanh = _make_unary_array_func("tanh") + def fmod(arg, mod, stream=None): """Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`.""" result = 
gpuarray.GPUArray(arg.shape, arg.dtype) if not arg.flags.forc or not mod.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_fmod_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + mod.gpudata, + result.gpudata, + arg.mem_size, + ) return result + def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_frexp_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + sig.gpudata, + expt.gpudata, + arg.mem_size, + ) return sig, expt + def ldexp(significand, exponent, stream=None): """Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. 
""" if not significand.flags.forc or not exponent.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) result = gpuarray.GPUArray(significand.shape, significand.dtype) func = elementwise.get_ldexp_kernel() - func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, - significand.mem_size) + func.prepared_async_call( + significand._grid, + significand._block, + stream, + significand.gpudata, + exponent.gpudata, + result.gpudata, + significand.mem_size, + ) return result + def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_modf_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, - arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + intpart.gpudata, + fracpart.gpudata, + arg.mem_size, + ) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index a5d3d38d0a241522251999037eef4d0f86a73ee2..fe6f54b5be39dd82e40b7b1d12cfee7acbd58ff5 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -1,13 +1,8 @@ -from __future__ import division -from __future__ import absolute_import - import numpy as np import pycuda.compiler import pycuda.driver as drv import pycuda.gpuarray as array from pytools import memoize_method -import six - # {{{ MD5-based random number generation @@ -186,8 +181,6 @@ md5_code = """ 
""" - - def rand(shape, dtype=np.float32, stream=None): from pycuda.gpuarray import GPUArray from pycuda.elementwise import get_elwise_kernel @@ -197,7 +190,8 @@ def rand(shape, dtype=np.float32, stream=None): if dtype == np.float32: func = get_elwise_kernel( "float *dest, unsigned int seed", - md5_code + """ + md5_code + + """ #define POW_2_M32 (1/4294967296.0f) dest[i] = a*POW_2_M32; if ((i += total_threads) < n) @@ -207,11 +201,13 @@ def rand(shape, dtype=np.float32, stream=None): if ((i += total_threads) < n) dest[i] = d*POW_2_M32; """, - "md5_rng_float") + "md5_rng_float", + ) elif dtype == np.float64: func = get_elwise_kernel( "double *dest, unsigned int seed", - md5_code + """ + md5_code + + """ #define POW_2_M32 (1/4294967296.0) #define POW_2_M64 (1/18446744073709551616.) @@ -222,11 +218,13 @@ def rand(shape, dtype=np.float32, stream=None): dest[i] = c*POW_2_M32 + d*POW_2_M64; } """, - "md5_rng_float") + "md5_rng_float", + ) elif dtype in [np.int32, np.uint32]: func = get_elwise_kernel( "unsigned int *dest, unsigned int seed", - md5_code + """ + md5_code + + """ dest[i] = a; if ((i += total_threads) < n) dest[i] = b; @@ -235,24 +233,35 @@ def rand(shape, dtype=np.float32, stream=None): if ((i += total_threads) < n) dest[i] = d; """, - "md5_rng_int") + "md5_rng_int", + ) else: - raise NotImplementedError; + raise NotImplementedError - func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + func.prepared_async_call( + result._grid, + result._block, + stream, + result.gpudata, + np.random.randint(2 ** 31 - 1), + result.size, + ) return result + # }}} # {{{ CURAND wrapper try: - import pycuda._driver as _curand # used to be separate module + import pycuda._driver as _curand # used to be separate module except ImportError: + def get_curand_version(): return None + + else: get_curand_version = _curand.get_curand_version @@ -355,7 +364,8 @@ __global__ void skip_ahead_array(%(state_type)s *s, const 
int n, const unsigned } """ -class _RandomNumberGeneratorBase(object): + +class _RandomNumberGeneratorBase: """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating random numbers with uniform @@ -371,34 +381,41 @@ class _RandomNumberGeneratorBase(object): ("normal_double", "double", "_normal_double"), ("normal_float2", "float2", "_normal2"), ("normal_double2", "double2", "_normal2_double"), - ] + ] gen_log_info = [ ("normal_log_float", "float", "float", "_normal"), ("normal_log_double", "double", "double", "_normal_double"), ("normal_log_float2", "float", "float2", "_normal2"), ("normal_log_double2", "double", "double2", "_normal2_double"), - ] + ] gen_poisson_info = [ ("poisson_int", "unsigned int", ""), - ] + ] gen_poisson_inplace_info = [ ("poisson_inplace_float", "float", ""), ("poisson_inplace_double", "double", ""), ("poisson_inplace_int", "unsigned int", ""), - ] - - def __init__(self, state_type, vector_type, generator_bits, - additional_source, scramble_type=None): + ] + + def __init__( + self, + state_type, + vector_type, + generator_bits, + additional_source, + scramble_type=None, + ): if get_curand_version() < (3, 2, 0): - raise EnvironmentError("Need at least CUDA 3.2") + raise OSError("Need at least CUDA 3.2") dev = drv.Context.get_device() self.block_count = dev.get_attribute( - pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT) + pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT + ) from pycuda.characterize import has_double_support @@ -411,67 +428,99 @@ class _RandomNumberGeneratorBase(object): return result my_generators = [ - (name, out_type, suffix) - for name, out_type, suffix in self.gen_info - if do_generate(out_type)] + (name, out_type, suffix) + for name, out_type, suffix in self.gen_info + if do_generate(out_type) + ] if get_curand_version() >= (4, 0, 0): my_log_generators = [ - (name, in_type, out_type, suffix) - for name, in_type, out_type, suffix in self.gen_log_info - if do_generate(out_type)] + (name, in_type, 
out_type, suffix) + for name, in_type, out_type, suffix in self.gen_log_info + if do_generate(out_type) + ] if get_curand_version() >= (5, 0, 0): my_poisson_generators = [ - (name, out_type, suffix) - for name, out_type, suffix in self.gen_poisson_info - if do_generate(out_type)] + (name, out_type, suffix) + for name, out_type, suffix in self.gen_poisson_info + if do_generate(out_type) + ] my_poisson_inplace_generators = [ - (name, inout_type, suffix) - for name, inout_type, suffix in self.gen_poisson_inplace_info - if do_generate(inout_type)] + (name, inout_type, suffix) + for name, inout_type, suffix in self.gen_poisson_inplace_info + if do_generate(inout_type) + ] generator_sources = [ - gen_template % { - "name": name, "out_type": out_type, "suffix": suffix, - "state_type": state_type, } - for name, out_type, suffix in my_generators] - + gen_template + % { + "name": name, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, out_type, suffix in my_generators + ] + if get_curand_version() >= (4, 0, 0): - generator_sources.extend([ - gen_log_template % { - "name": name, "in_type": in_type, "out_type": out_type, - "suffix": suffix, "state_type": state_type, } - for name, in_type, out_type, suffix in my_log_generators]) + generator_sources.extend( + [ + gen_log_template + % { + "name": name, + "in_type": in_type, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, in_type, out_type, suffix in my_log_generators + ] + ) if get_curand_version() >= (5, 0, 0): - generator_sources.extend([ - gen_poisson_template % { - "name": name, "out_type": out_type, "suffix": suffix, - "state_type": state_type, } - for name, out_type, suffix in my_poisson_generators]) - generator_sources.extend([ - gen_poisson_inplace_template % { - "name": name, "inout_type": inout_type, "suffix": suffix, - "state_type": state_type, } - for name, inout_type, suffix in my_poisson_inplace_generators]) + generator_sources.extend( + [ 
+ gen_poisson_template + % { + "name": name, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, out_type, suffix in my_poisson_generators + ] + ) + generator_sources.extend( + [ + gen_poisson_inplace_template + % { + "name": name, + "inout_type": inout_type, + "suffix": suffix, + "state_type": state_type, + } + for name, inout_type, suffix in my_poisson_inplace_generators + ] + ) source = (random_source + additional_source) % { "state_type": state_type, "vector_type": vector_type, "scramble_type": scramble_type, - "generators": "\n".join(generator_sources)} + "generators": "\n".join(generator_sources), + } # store in instance to let subclass constructors get to it. self.module = module = pycuda.compiler.SourceModule(source, no_extern_c=True) self.generators = {} - for name, out_type, suffix in my_generators: + for name, out_type, suffix in my_generators: gen_func = module.get_function(name) gen_func.prepare("PPi") self.generators[name] = gen_func if get_curand_version() >= (4, 0, 0): - for name, in_type, out_type, suffix in my_log_generators: + for name, in_type, out_type, suffix in my_log_generators: gen_func = module.get_function(name) if in_type == "float": gen_func.prepare("PPffi") @@ -479,11 +528,11 @@ class _RandomNumberGeneratorBase(object): gen_func.prepare("PPddi") self.generators[name] = gen_func if get_curand_version() >= (5, 0, 0): - for name, out_type, suffix in my_poisson_generators: + for name, out_type, suffix in my_poisson_generators: gen_func = module.get_function(name) gen_func.prepare("PPdi") self.generators[name] = gen_func - for name, inout_type, suffix in my_poisson_inplace_generators: + for name, inout_type, suffix in my_poisson_inplace_generators: gen_func = module.get_function(name) gen_func.prepare("PPi") self.generators[name] = gen_func @@ -504,24 +553,26 @@ class _RandomNumberGeneratorBase(object): self.skip_ahead_array.prepare("PiP") def _kernels(self): - return ( - 
list(six.itervalues(self.generators)) - + [self.skip_ahead, self.skip_ahead_array]) + return list(self.generators.values()) + [ + self.skip_ahead, + self.skip_ahead_array, + ] @property @memoize_method def generators_per_block(self): - return min(kernel.max_threads_per_block - for kernel in self._kernels()) + return min(kernel.max_threads_per_block for kernel in self._kernels()) @property def state(self): if self._state is None: from pycuda.characterize import sizeof + data_type_size = sizeof(self.state_type, "#include ") self._state = drv.mem_alloc( - self.block_count * self.generators_per_block * data_type_size) + self.block_count * self.generators_per_block * data_type_size + ) return self._state @@ -538,8 +589,13 @@ class _RandomNumberGeneratorBase(object): raise NotImplementedError func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + data.size, + ) def fill_normal(self, data, stream=None): if data.dtype == np.float32: @@ -557,8 +613,13 @@ class _RandomNumberGeneratorBase(object): func = self.generators[func_name] func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, int(data_size)) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + int(data_size), + ) def gen_uniform(self, shape, dtype, stream=None): result = array.empty(shape, dtype) @@ -571,6 +632,7 @@ class _RandomNumberGeneratorBase(object): return result if get_curand_version() >= (4, 0, 0): + def fill_log_normal(self, data, mean, stddev, stream=None): if data.dtype == np.float32: func_name = "normal_log_float" @@ -587,8 +649,15 @@ class _RandomNumberGeneratorBase(object): func = self.generators[func_name] func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - 
self.state, data.gpudata, mean, stddev, int(data_size)) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + mean, + stddev, + int(data_size), + ) def gen_log_normal(self, shape, dtype, mean, stddev, stream=None): result = array.empty(shape, dtype) @@ -596,6 +665,7 @@ class _RandomNumberGeneratorBase(object): return result if get_curand_version() >= (5, 0, 0): + def fill_poisson(self, data, lambda_value=None, stream=None): if lambda_value is None: if data.dtype == np.float32: @@ -616,12 +686,23 @@ class _RandomNumberGeneratorBase(object): if lambda_value is None: func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + data.size, + ) else: func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, lambda_value, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + lambda_value, + data.size, + ) def gen_poisson(self, shape, dtype, lambda_value, stream=None): result = array.empty(shape, dtype) @@ -630,44 +711,68 @@ class _RandomNumberGeneratorBase(object): def call_skip_ahead(self, i, stream=None): self.skip_ahead.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block, + i, + ) def call_skip_ahead_array(self, i, stream=None): self.skip_ahead_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block, + i.gpudata, + ) + # }}} # {{{ 
XORWOW RNG -class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): - def __init__(self, seed_getter, offset, state_type, vector_type, - generator_bits, additional_source, scramble_type=None): - super(_PseudoRandomNumberGeneratorBase, self).__init__( - state_type, vector_type, generator_bits, additional_source) +class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): + def __init__( + self, + seed_getter, + offset, + state_type, + vector_type, + generator_bits, + additional_source, + scramble_type=None, + ): + + super().__init__( + state_type, vector_type, generator_bits, additional_source + ) generator_count = self.generators_per_block * self.block_count if seed_getter is None: seed = array.to_gpu( - np.asarray( - np.random.randint( - 0, (1 << 31) - 1, generator_count), - dtype=np.int32)) + np.asarray( + np.random.randint(0, (1 << 31) - 1, generator_count), dtype=np.int32 + ) + ) else: seed = seed_getter(generator_count) - if not (isinstance(seed, pycuda.gpuarray.GPUArray) - and seed.dtype == np.int32 - and seed.size == generator_count): + if not ( + isinstance(seed, pycuda.gpuarray.GPUArray) + and seed.dtype == np.int32 + and seed.size == generator_count + ): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -675,11 +780,16 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: p.prepared_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), self.state, - generator_count, seed.gpudata, offset) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + generator_count, + seed.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. 
Decrease number of threads.") @@ -694,36 +804,55 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): self.skip_ahead_array.prepare("PiP") self.skip_ahead_sequence = self.module.get_function("skip_ahead_sequence") self.skip_ahead_sequence.prepare("PiQ") - self.skip_ahead_sequence_array = self.module.get_function("skip_ahead_sequence_array") + self.skip_ahead_sequence_array = self.module.get_function( + "skip_ahead_sequence_array" + ) self.skip_ahead_sequence_array.prepare("PiP") def call_skip_ahead_sequence(self, i, stream=None): self.skip_ahead_sequence.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i, + ) def call_skip_ahead_sequence_array(self, i, stream=None): self.skip_ahead_sequence_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i.gpudata, + ) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")] - + [self.module.get_function("skip_ahead_sequence"), - self.module.get_function("skip_ahead_sequence_array")]) + return ( + _RandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("prepare")] + + [ + self.module.get_function("skip_ahead_sequence"), + self.module.get_function("skip_ahead_sequence_array"), + ] + ) -def seed_getter_uniform(N): - result = pycuda.gpuarray.empty([N], np.int32) +def seed_getter_uniform(n): + result = pycuda.gpuarray.empty([n], np.int32) import random - value = random.randint(0, 2**31-1) + + value = random.randint(0, 2 ** 31 - 1) return result.fill(value) -def 
seed_getter_unique(N): - result = np.random.randint(0, 2**31-1, N).astype(np.int32) + +def seed_getter_unique(n): + result = np.random.randint(0, 2 ** 31 - 1, n).astype(np.int32) return pycuda.gpuarray.to_gpu(result) + xorwow_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, @@ -755,6 +884,7 @@ __global__ void skip_ahead_sequence_array(%(state_type)s *s, const int n, const """ if get_curand_version() >= (3, 2, 0): + class XORWOWRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True @@ -764,10 +894,17 @@ if get_curand_version() >= (3, 2, 0): :class:`GPUArray` of seeds. """ - super(XORWOWRandomNumberGenerator, self).__init__( - seed_getter, offset, - 'curandStateXORWOW', 'unsigned int', 32, xorwow_random_source+ - xorwow_skip_ahead_sequence_source+random_skip_ahead64_source) + super().__init__( + seed_getter, + offset, + "curandStateXORWOW", + "unsigned int", + 32, + xorwow_random_source + + xorwow_skip_ahead_sequence_source + + random_skip_ahead64_source, + ) + # }}} @@ -818,6 +955,7 @@ __global__ void skip_ahead_subsequence_array(%(state_type)s *s, const int n, con """ if get_curand_version() >= (4, 1, 0): + class MRG32k3aRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True @@ -827,41 +965,66 @@ if get_curand_version() >= (4, 1, 0): :class:`GPUArray` of seeds. 
""" - super(MRG32k3aRandomNumberGenerator, self).__init__( - seed_getter, offset, - 'curandStateMRG32k3a', 'unsigned int', 32, mrg32k3a_random_source+ - mrg32k3a_skip_ahead_sequence_source+random_skip_ahead64_source) + super().__init__( + seed_getter, + offset, + "curandStateMRG32k3a", + "unsigned int", + 32, + mrg32k3a_random_source + + mrg32k3a_skip_ahead_sequence_source + + random_skip_ahead64_source, + ) def _prepare_skipahead(self): - super(MRG32k3aRandomNumberGenerator, self)._prepare_skipahead() - self.skip_ahead_subsequence = self.module.get_function("skip_ahead_subsequence") + super()._prepare_skipahead() + self.skip_ahead_subsequence = self.module.get_function( + "skip_ahead_subsequence" + ) self.skip_ahead_subsequence.prepare("PiQ") - self.skip_ahead_subsequence_array = self.module.get_function("skip_ahead_subsequence_array") + self.skip_ahead_subsequence_array = self.module.get_function( + "skip_ahead_subsequence_array" + ) self.skip_ahead_subsequence_array.prepare("PiP") def call_skip_ahead_subsequence(self, i, stream=None): self.skip_ahead_subsequence.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i, + ) def call_skip_ahead_subsequence_array(self, i, stream=None): self.skip_ahead_subsequence_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i.gpudata, + ) def _kernels(self): - return (_PseudoRandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("skip_ahead_subsequence"), - self.module.get_function("skip_ahead_subsequence_array")]) + return 
_PseudoRandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("skip_ahead_subsequence"), + self.module.get_function("skip_ahead_subsequence_array"), + ] + # }}} # {{{ Sobol RNG + def generate_direction_vectors(count, direction=None): if get_curand_version() >= (4, 0, 0): - if direction == direction_vector_set.VECTOR_64 or \ - direction == direction_vector_set.SCRAMBLED_VECTOR_64: + if ( + direction == direction_vector_set.VECTOR_64 + or direction == direction_vector_set.SCRAMBLED_VECTOR_64 + ): result = np.empty((count, 64), dtype=np.uint64) else: result = np.empty((count, 32), dtype=np.uint32) @@ -870,17 +1033,20 @@ def generate_direction_vectors(count, direction=None): _get_direction_vectors(direction, result, count) return pycuda.gpuarray.to_gpu(result) + if get_curand_version() >= (4, 0, 0): + def generate_scramble_constants32(count): - result = np.empty((count, ), dtype=np.uint32) + result = np.empty((count,), dtype=np.uint32) _get_scramble_constants32(result, count) return pycuda.gpuarray.to_gpu(result) def generate_scramble_constants64(count): - result = np.empty((count, ), dtype=np.uint64) + result = np.empty((count,), dtype=np.uint64) _get_scramble_constants64(result, count) return pycuda.gpuarray.to_gpu(result) + sobol_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, @@ -893,6 +1059,7 @@ __global__ void prepare(%(state_type)s *s, const int n, } """ + class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. 
@@ -902,25 +1069,40 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): has_box_muller = False - def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, - dir_vector_set, offset, state_type, vector_type, generator_bits, - sobol_random_source): - super(_SobolRandomNumberGeneratorBase, self).__init__(state_type, - vector_type, generator_bits, sobol_random_source) + def __init__( + self, + dir_vector, + dir_vector_dtype, + dir_vector_size, + dir_vector_set, + offset, + state_type, + vector_type, + generator_bits, + sobol_random_source, + ): + super().__init__( + state_type, vector_type, generator_bits, sobol_random_source + ) if dir_vector is None: dir_vector = generate_direction_vectors( - self.block_count * self.generators_per_block, dir_vector_set) - - if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) - and dir_vector.dtype == dir_vector_dtype - and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + self.block_count * self.generators_per_block, dir_vector_set + ) + + if not ( + isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape + == (self.block_count * self.generators_per_block, dir_vector_size) + ): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -928,11 +1110,16 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: - p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), - self.state, self.block_count * self.generators_per_block, - dir_vector.gpudata, offset) + p.prepared_call( + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + self.block_count * 
self.generators_per_block, + dir_vector.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") @@ -941,8 +1128,10 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")]) + return _RandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("prepare") + ] + scrambledsobol_random_source = """ extern "C" { @@ -956,6 +1145,7 @@ __global__ void prepare( %(state_type)s *s, const int n, } """ + class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. @@ -965,36 +1155,55 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): has_box_muller = False - def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, - dir_vector_set, scramble_vector, scramble_vector_function, - offset, state_type, vector_type, generator_bits, scramble_type, - sobol_random_source): - super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type, - vector_type, generator_bits, sobol_random_source, scramble_type) + def __init__( + self, + dir_vector, + dir_vector_dtype, + dir_vector_size, + dir_vector_set, + scramble_vector, + scramble_vector_function, + offset, + state_type, + vector_type, + generator_bits, + scramble_type, + sobol_random_source, + ): + super().__init__( + state_type, vector_type, generator_bits, sobol_random_source, scramble_type + ) if dir_vector is None: dir_vector = generate_direction_vectors( - self.block_count * self.generators_per_block, - dir_vector_set) + self.block_count * self.generators_per_block, dir_vector_set + ) if scramble_vector is None: scramble_vector = scramble_vector_function( - self.block_count * self.generators_per_block) - - if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) - and 
dir_vector.dtype == dir_vector_dtype - and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + self.block_count * self.generators_per_block + ) + + if not ( + isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape + == (self.block_count * self.generators_per_block, dir_vector_size) + ): raise TypeError("seed must be GPUArray of integers of right length") - if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray) - and scramble_vector.dtype == dir_vector_dtype - and scramble_vector.shape == (self.block_count * self.generators_per_block, )): + if not ( + isinstance(scramble_vector, pycuda.gpuarray.GPUArray) + and scramble_vector.dtype == dir_vector_dtype + and scramble_vector.shape == (self.block_count * self.generators_per_block,) + ): raise TypeError("scramble must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -1002,11 +1211,17 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: - p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), - self.state, self.block_count * self.generators_per_block, - dir_vector.gpudata, scramble_vector.gpudata, offset) + p.prepared_call( + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + self.block_count * self.generators_per_block, + dir_vector.gpudata, + scramble_vector.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. 
Decrease number of threads.") @@ -1015,10 +1230,13 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")]) + return _RandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("prepare") + ] + if get_curand_version() >= (3, 2, 0): + class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. @@ -1027,14 +1245,24 @@ if get_curand_version() >= (3, 2, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol32RandomNumberGenerator, self).__init__(dir_vector, - np.uint32, 32, direction_vector_set.VECTOR_32, offset, - 'curandStateSobol32', 'curandDirectionVectors32_t', 32, - sobol_random_source+random_skip_ahead32_source) + super().__init__( + dir_vector, + np.uint32, + 32, + direction_vector_set.VECTOR_32, + offset, + "curandStateSobol32", + "curandDirectionVectors32_t", + 32, + sobol_random_source + random_skip_ahead32_source, + ) if get_curand_version() >= (4, 0, 0): - class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + + class ScrambledSobol32RandomNumberGenerator( + _ScrambledSobolRandomNumberGeneratorBase + ): """ Class surrounding CURAND kernels from CUDA 4.0. 
It allows for generating quasi-random numbers with uniform @@ -1042,14 +1270,24 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector, - np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32, - scramble_vector, generate_scramble_constants32, offset, - 'curandStateScrambledSobol32', 'curandDirectionVectors32_t', - 32, 'unsigned int', - scrambledsobol_random_source+random_skip_ahead32_source) + super().__init__( + dir_vector, + np.uint32, + 32, + direction_vector_set.SCRAMBLED_VECTOR_32, + scramble_vector, + generate_scramble_constants32, + offset, + "curandStateScrambledSobol32", + "curandDirectionVectors32_t", + 32, + "unsigned int", + scrambledsobol_random_source + random_skip_ahead32_source, + ) + if get_curand_version() >= (4, 0, 0): + class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. @@ -1058,13 +1296,24 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol64RandomNumberGenerator, self).__init__(dir_vector, - np.uint64, 64, direction_vector_set.VECTOR_64, offset, - 'curandStateSobol64', 'curandDirectionVectors64_t', 64, - sobol_random_source+random_skip_ahead64_source) + super().__init__( + dir_vector, + np.uint64, + 64, + direction_vector_set.VECTOR_64, + offset, + "curandStateSobol64", + "curandDirectionVectors64_t", + 64, + sobol_random_source + random_skip_ahead64_source, + ) + if get_curand_version() >= (4, 0, 0): - class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + + class ScrambledSobol64RandomNumberGenerator( + _ScrambledSobolRandomNumberGeneratorBase + ): """ Class surrounding CURAND kernels from CUDA 4.0. 
It allows for generating quasi-random numbers with uniform @@ -1072,19 +1321,25 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector, - np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64, - scramble_vector, generate_scramble_constants64, offset, - 'curandStateScrambledSobol64', 'curandDirectionVectors64_t', - 64, 'unsigned long long', - scrambledsobol_random_source+random_skip_ahead64_source) + super().__init__( + dir_vector, + np.uint64, + 64, + direction_vector_set.SCRAMBLED_VECTOR_64, + scramble_vector, + generate_scramble_constants64, + offset, + "curandStateScrambledSobol64", + "curandDirectionVectors64_t", + 64, + "unsigned long long", + scrambledsobol_random_source + random_skip_ahead64_source, + ) -# }}} # }}} - - +# }}} # vim: foldmethod=marker diff --git a/pycuda/debug.py b/pycuda/debug.py index b2a90b9b0a3463d34ac3cb99a62708ca7ea27e4a..dee5974c93efc3c5d239df2782abfe3f0aed9071 100644 --- a/pycuda/debug.py +++ b/pycuda/debug.py @@ -1,13 +1,12 @@ -from __future__ import absolute_import -from __future__ import print_function import pycuda.driver -pycuda.driver.set_debugging() import sys - from optparse import OptionParser -parser = OptionParser( - usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") +from os.path import exists + +pycuda.driver.set_debugging() + +parser = OptionParser(usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") parser.disable_interspersed_args() options, args = parser.parse_args() @@ -16,12 +15,12 @@ if len(args) < 1: parser.print_help() sys.exit(2) -mainpyfile = args[0] -from os.path import exists +mainpyfile = args[0] + if not exists(mainpyfile): - print('Error:', mainpyfile, 'does not exist') + print("Error:", mainpyfile, "does not exist") sys.exit(1) sys.argv = args -exec(compile(open(mainpyfile).read(), mainpyfile, 'exec')) +exec(compile(open(mainpyfile).read(), 
mainpyfile, "exec")) diff --git a/pycuda/driver.py b/pycuda/driver.py index fd042a758436fcbbd3a658f47ad71442a5fa62ef..6bfd097e560798ed6a658628fec0536b279b29e5 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,15 +1,10 @@ -from __future__ import absolute_import, print_function - import os -import sys - -import six - import numpy as np # {{{ add cuda lib dir to Python DLL path + def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 @@ -32,7 +27,7 @@ def _add_cuda_libdir_to_dll_path(): cuda_path = os.environ.get("CUDA_PATH") if cuda_path is not None: - os.add_dll_directory(join(cuda_path, 'bin')) + os.add_dll_directory(join(cuda_path, "bin")) return nvcc_path = _search_on_path(["nvcc.exe"]) @@ -40,10 +35,13 @@ def _add_cuda_libdir_to_dll_path(): os.add_dll_directory(dirname(nvcc_path)) from warnings import warn - warn("Unable to discover CUDA installation directory " - "while attempting to add it to Python's DLL path. " - "Either set the 'CUDA_PATH' environment variable " - "or ensure that 'nvcc.exe' is on the path.") + + warn( + "Unable to discover CUDA installation directory " + "while attempting to add it to Python's DLL path. " + "Either set the 'CUDA_PATH' environment variable " + "or ensure that 'nvcc.exe' is on the path." + ) try: @@ -63,18 +61,17 @@ try: except ImportError as e: if "_v2" in str(e): from warnings import warn - warn("Failed to import the CUDA driver interface, with an error " - "message indicating that the version of your CUDA header " - "does not match the version of your CUDA driver.") + + warn( + "Failed to import the CUDA driver interface, with an error " + "message indicating that the version of your CUDA header " + "does not match the version of your CUDA driver." 
+ ) raise -if sys.version_info >= (3,): - _memoryview = memoryview - _my_bytes = bytes -else: - _memoryview = buffer - _my_bytes = str +_memoryview = memoryview +_my_bytes = bytes try: @@ -83,7 +80,7 @@ except NameError: # Provide ManagedAllocationOrStub if not on CUDA 6. # This avoids having to do a version check in a high-traffic code path below. - class ManagedAllocationOrStub(object): + class ManagedAllocationOrStub: pass @@ -117,7 +114,7 @@ class CompileError(Error): return result -class ArgumentHandler(object): +class ArgumentHandler: def __init__(self, ary): self.array = ary self.dev_alloc = None @@ -127,7 +124,10 @@ class ArgumentHandler(object): try: self.dev_alloc = mem_alloc_like(self.array) except AttributeError: - raise TypeError("could not determine array length of '%s': unsupported array type or not an array" % type(self.array)) + raise TypeError( + "could not determine array length of '%s': unsupported array type or not an array" + % type(self.array) + ) return self.dev_alloc def pre_call(self, stream): @@ -155,7 +155,6 @@ class InOut(In, Out): def _add_functionality(): - def device_get_attributes(dev): result = {} @@ -169,8 +168,11 @@ def _add_functionality(): att_value = dev.get_attribute(att_id) except LogicError as e: from warnings import warn - warn("CUDA driver raised '%s' when querying '%s' on '%s'" - % (e, att_name, dev)) + + warn( + "CUDA driver raised '%s' when querying '%s' on '%s'" + % (e, att_name, dev) + ) else: result[att_id] = att_value @@ -216,6 +218,7 @@ def _add_functionality(): format += "P" from pycuda._pvt_struct import pack + return handlers, pack(format, *arg_data) # {{{ pre-CUDA 4 call interface (stateful) @@ -240,8 +243,8 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" - % (",".join(six.iterkeys(kwargs)))) + "extra keyword arguments: %s" % (",".join(kwargs.keys())) + ) if block is None: raise ValueError("must specify block size") @@ -257,22 +260,23 @@ def _add_functionality(): for 
texref in texrefs: func.param_set_texref(texref) - post_handlers = [handler - for handler in handlers - if hasattr(handler, "post_call")] + post_handlers = [ + handler for handler in handlers if hasattr(handler, "post_call") + ] if stream is None: if time_kernel: Context.synchronize() from time import time + start_time = time() func._launch_grid(*grid) if post_handlers or time_kernel: Context.synchronize() if time_kernel: - run_time = time()-start_time + run_time = time() - start_time for handler in post_handlers: handler.post_call(stream) @@ -280,25 +284,32 @@ def _add_functionality(): if time_kernel: return run_time else: - assert not time_kernel, \ - "Can't time the kernel on an asynchronous invocation" + assert ( + not time_kernel + ), "Can't time the kernel on an asynchronous invocation" func._launch_grid_async(grid[0], grid[1], stream) if post_handlers: for handler in post_handlers: handler.post_call(stream) - def function_prepare_pre_v4(func, arg_types, block=None, - shared=None, texrefs=[]): + def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=[]): from warnings import warn + if block is not None: - warn("setting the block size in Function.prepare is deprecated", - DeprecationWarning, stacklevel=2) + warn( + "setting the block size in Function.prepare is deprecated", + DeprecationWarning, + stacklevel=2, + ) func._set_block_shape(*block) if shared is not None: - warn("setting the shared memory size in Function.prepare is deprecated", - DeprecationWarning, stacklevel=2) + warn( + "setting the shared memory size in Function.prepare is deprecated", + DeprecationWarning, + stacklevel=2, + ) func._set_shared_size(shared) func.texrefs = texrefs @@ -306,8 +317,11 @@ def _add_functionality(): func.arg_format = "" for i, arg_type in enumerate(arg_types): - if (isinstance(arg_type, type) - and np is not None and np.number in arg_type.__mro__): + if ( + isinstance(arg_type, type) + and np is not None + and np.number in arg_type.__mro__ + ): 
func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, str): func.arg_format += arg_type @@ -315,6 +329,7 @@ def _add_functionality(): func.arg_format += np.dtype(np.uintp).char from pycuda._pvt_struct import calcsize + func._param_set_size(calcsize(func.arg_format)) return func @@ -324,8 +339,13 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_call is deprecated as of " - "version 2011.1.", DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_call is deprecated as of " + "version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", None) @@ -333,10 +353,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -349,9 +371,13 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_timed_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_timed_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", None) @@ -359,10 +385,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -377,19 +405,22 @@ def 
_add_functionality(): def get_call_time(): end.synchronize() - return end.time_since(start)*1e-3 + return end.time_since(start) * 1e-3 return get_call_time - def function_prepared_async_call_pre_v4(func, grid, block, stream, - *args, **kwargs): + def function_prepared_async_call_pre_v4(func, grid, block, stream, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_async_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_async_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (stream,) + args stream = block @@ -398,10 +429,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -427,8 +460,8 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" - % (",".join(six.iterkeys(kwargs)))) + "extra keyword arguments: %s" % (",".join(kwargs.keys())) + ) if block is None: raise ValueError("must specify block size") @@ -442,15 +475,16 @@ def _add_functionality(): for texref in texrefs: func.param_set_texref(texref) - post_handlers = [handler - for handler in handlers - if hasattr(handler, "post_call")] + post_handlers = [ + handler for handler in handlers if hasattr(handler, "post_call") + ] if stream is None: if time_kernel: Context.synchronize() from time import time + start_time = time() func._launch_kernel(grid, block, arg_buf, shared, None) @@ -459,7 +493,7 @@ def _add_functionality(): Context.synchronize() if time_kernel: - run_time = time()-start_time + run_time = time() - start_time for handler in 
post_handlers: handler.post_call(stream) @@ -467,8 +501,9 @@ def _add_functionality(): if time_kernel: return run_time else: - assert not time_kernel, \ - "Can't time the kernel on an asynchronous invocation" + assert ( + not time_kernel + ), "Can't time the kernel on an asynchronous invocation" func._launch_kernel(grid, block, arg_buf, shared, stream) if post_handlers: @@ -481,8 +516,7 @@ def _add_functionality(): func.arg_format = "" for i, arg_type in enumerate(arg_types): - if (isinstance(arg_type, type) - and np.number in arg_type.__mro__): + if isinstance(arg_type, type) and np.number in arg_type.__mro__: func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, np.dtype): if arg_type.char == "V": @@ -501,17 +535,24 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_call is deprecated as of " - "version 2011.1.", DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_call is deprecated as of " + "version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -522,10 +563,12 @@ def _add_functionality(): def function_prepared_timed_call(func, grid, block, *args, **kwargs): shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -540,7 +583,7 @@ def _add_functionality(): def get_call_time(): end.synchronize() - return 
end.time_since(start)*1e-3 + return end.time_since(start) * 1e-3 return get_call_time @@ -549,19 +592,25 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_async_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_async_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (stream,) + args stream = block shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(kwargs.keys()) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -587,9 +636,13 @@ def _add_functionality(): def mark_func_method_deprecated(func): def new_func(*args, **kwargs): from warnings import warn - warn("'%s' has been deprecated in version 2011.1. Please use " - "the stateless launch interface instead." % func.__name__[1:], - DeprecationWarning, stacklevel=2) + + warn( + "'%s' has been deprecated in version 2011.1. Please use " + "the stateless launch interface instead." % func.__name__[1:], + DeprecationWarning, + stacklevel=2, + ) return func(*args, **kwargs) try: @@ -599,7 +652,7 @@ def _add_functionality(): else: try: update_wrapper(new_func, func) - except: + except Exception: # User won't see true signature. Oh well. 
pass @@ -622,12 +675,23 @@ def _add_functionality(): Function.prepared_timed_call = function_prepared_timed_call_pre_v4 Function.prepared_async_call = function_prepared_async_call_pre_v4 - for meth_name in ["set_block_shape", "set_shared_size", - "param_set_size", "param_set", "param_seti", "param_setf", - "param_setv", - "launch", "launch_grid", "launch_grid_async"]: - setattr(Function, meth_name, mark_func_method_deprecated( - getattr(Function, "_"+meth_name))) + for meth_name in [ + "set_block_shape", + "set_shared_size", + "param_set_size", + "param_set", + "param_seti", + "param_setf", + "param_setv", + "launch", + "launch_grid", + "launch_grid_async", + ]: + setattr( + Function, + meth_name, + mark_func_method_deprecated(getattr(Function, "_" + meth_name)), + ) Function.__getattr__ = function___getattr__ @@ -637,6 +701,7 @@ _add_functionality() # {{{ pagelocked numpy arrays + def pagelocked_zeros(shape, dtype, order="C", mem_flags=0): result = pagelocked_empty(shape, dtype, order, mem_flags) result.fill(0) @@ -659,11 +724,13 @@ def pagelocked_zeros_like(array, mem_flags=0): result.fill(0) return result + # }}} # {{{ aligned numpy arrays + def aligned_zeros(shape, dtype, order="C", alignment=4096): result = aligned_empty(shape, dtype, order, alignment) result.fill(0) @@ -686,11 +753,13 @@ def aligned_zeros_like(array, alignment=4096): result.fill(0) return result + # }}} # {{{ managed numpy arrays (CUDA Unified Memory) + def managed_zeros(shape, dtype, order="C", mem_flags=0): result = managed_empty(shape, dtype, order, mem_flags) result.fill(0) @@ -713,6 +782,7 @@ def managed_zeros_like(array, mem_flags=0): result.fill(0) return result + # }}} @@ -722,6 +792,7 @@ def mem_alloc_like(ary): # {{{ array handling + def dtype_to_array_format(dtype): if dtype == np.uint8: return array_format.UNSIGNED_INT8 @@ -738,9 +809,7 @@ def dtype_to_array_format(dtype): elif dtype == np.float32: return array_format.FLOAT else: - raise TypeError( - "cannot convert dtype '%s' to 
array format" - % dtype) + raise TypeError("cannot convert dtype '%s' to array format" % dtype) def matrix_to_array(matrix, order, allow_double_hack=False): @@ -771,22 +840,24 @@ def matrix_to_array(matrix, order, allow_double_hack=False): copy = Memcpy2D() copy.set_src_host(matrix) copy.set_dst_array(ary) - copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ - matrix.strides[stride] + copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = matrix.strides[stride] copy.height = h copy(aligned=True) return ary -def np_to_array(nparray, order, allowSurfaceBind=False): - case = order in ["C","F"] + +def np_to_array(nparray, order, allowSurfaceBind=False): # noqa: N803 + case = order in ["C", "F"] if not case: raise LogicError("order must be either F or C") dimension = len(nparray.shape) if dimension == 2: - if order == "C": stride = 0 - if order == "F": stride = -1 + if order == "C": + stride = 0 + if order == "F": + stride = -1 h, w = nparray.shape d = 1 if allowSurfaceBind: @@ -799,35 +870,46 @@ def np_to_array(nparray, order, allowSurfaceBind=False): descrArr.width = w descrArr.height = h elif dimension == 3: - if order == "C": stride = 1 - if order == "F": stride = 1 + if order == "C": + stride = 1 + if order == "F": + stride = 1 d, h, w = nparray.shape descrArr = ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: - raise LogicError("CUDArrays dimensions 2 or 3 supported in CUDA at the moment ... ") + raise LogicError( + "CUDArrays dimensions 2 or 3 supported in CUDA at the moment ... 
" + ) if nparray.dtype == np.complex64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif nparray.dtype == np.float64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif nparray.dtype == np.complex128: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = dtype_to_array_format(nparray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: - if dimension==2: descrArr.flags |= array3d_flags.ARRAY3D_LAYERED + if dimension == 2: + descrArr.flags |= array3d_flags.ARRAY3D_LAYERED descrArr.flags |= array3d_flags.SURFACE_LDST cudaArray = Array(descrArr) - if allowSurfaceBind or dimension==3: + if allowSurfaceBind or dimension == 3: copy3D = Memcpy3D() copy3D.set_src_host(nparray) copy3D.set_dst_array(cudaArray) @@ -845,15 +927,18 @@ def np_to_array(nparray, order, allowSurfaceBind=False): copy2D(aligned=True) return cudaArray -def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): - case = order in ["C","F"] + +def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): # noqa: N803 + case = order in ["C", "F"] if not case: raise LogicError("order must be either F or C") dimension = len(gpuarray.shape) if dimension == 2: - if order == "C": stride = 0 - if order == "F": stride = -1 + if order == "C": + stride = 0 + if order == "F": + stride = -1 h, w = gpuarray.shape d = 1 if allowSurfaceBind: @@ -866,35 +951,46 @@ def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): descrArr.width = int(w) descrArr.height = int(h) 
elif dimension == 3: - if order == "C": stride = 1 - if order == "F": stride = 1 + if order == "C": + stride = 1 + if order == "F": + stride = 1 d, h, w = gpuarray.shape descrArr = ArrayDescriptor3D() descrArr.width = int(w) descrArr.height = int(h) descrArr.depth = int(d) else: - raise LogicError("CUDArray dimensions 2 and 3 supported in CUDA at the moment ... ") + raise LogicError( + "CUDArray dimensions 2 and 3 supported in CUDA at the moment ... " + ) if gpuarray.dtype == np.complex64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif gpuarray.dtype == np.float64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif gpuarray.dtype == np.complex128: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = dtype_to_array_format(gpuarray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: - if dimension==2: descrArr.flags |= array3d_flags.ARRAY3D_LAYERED + if dimension == 2: + descrArr.flags |= array3d_flags.ARRAY3D_LAYERED descrArr.flags |= array3d_flags.SURFACE_LDST cudaArray = Array(descrArr) - if allowSurfaceBind or dimension==3: + if allowSurfaceBind or dimension == 3: copy3D = Memcpy3D() copy3D.set_src_device(gpuarray.ptr) copy3D.set_dst_array(cudaArray) @@ -912,6 +1008,7 @@ def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): copy2D(aligned=True) return cudaArray + def make_multichannel_2d_array(ndarray, order): """Channel count has to be the first dimension of the C{ndarray}.""" @@ -936,8 +1033,7 @@ def 
make_multichannel_2d_array(ndarray, order): copy = Memcpy2D() copy.set_src_host(ndarray) copy.set_dst_array(ary) - copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ - ndarray.strides[stride] + copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = ndarray.strides[stride] copy.height = h copy(aligned=True) @@ -950,6 +1046,7 @@ def bind_array_to_texref(ary, texref): texref.set_address_mode(1, address_mode.CLAMP) texref.set_filter_mode(filter_mode.POINT) + # }}} @@ -959,8 +1056,10 @@ def matrix_to_texref(matrix, texref, order): # {{{ device copies + def to_device(bf_obj): import sys + if sys.version_info >= (2, 7): bf = memoryview(bf_obj).tobytes() else: @@ -981,6 +1080,7 @@ def from_device_like(devptr, other_ary): memcpy_dtoh(result, devptr) return result + # }}} # vim: fdm=marker diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index ab451ade174fe31c94a8789dd401dfef9fd1ff63..e6dd5a9223dd299f8acbfdf5307a7ef06e321f65 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -1,11 +1,5 @@ """Elementwise functionality.""" -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -38,11 +32,20 @@ from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): +def get_elwise_module( + arguments, + operation, + name="kernel", + keep=False, + options=None, + preamble="", + loop_prep="", + after_loop="", +): from pycuda.compiler import SourceModule - return SourceModule(""" + + return SourceModule( + """ #include %(preamble)s @@ -65,22 +68,35 @@ def get_elwise_module(arguments, operation, %(after_loop)s; } - """ % { + """ + % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, 
"preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, - }, - options=options, keep=keep, no_extern_c=True) - - -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): + }, + options=options, + keep=keep, + no_extern_c=True, + ) + + +def get_elwise_range_module( + arguments, + operation, + name="kernel", + keep=False, + options=None, + preamble="", + loop_prep="", + after_loop="", +): from pycuda.compiler import SourceModule - return SourceModule(""" + + return SourceModule( + """ #include %(preamble)s @@ -114,29 +130,43 @@ def get_elwise_range_module(arguments, operation, %(after_loop)s; } - """ % { + """ + % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, - }, - options=options, keep=keep, no_extern_c=True) - - -def get_elwise_kernel_and_types(arguments, operation, - name="kernel", keep=False, options=None, use_range=False, **kwargs): + }, + options=options, + keep=keep, + no_extern_c=True, + ) + + +def get_elwise_kernel_and_types( + arguments, + operation, + name="kernel", + keep=False, + options=None, + use_range=False, + **kwargs +): if isinstance(arguments, str): from pycuda.tools import parse_c_arg + arguments = [parse_c_arg(arg) for arg in arguments.split(",")] if use_range: - arguments.extend([ - ScalarArg(np.intp, "start"), - ScalarArg(np.intp, "stop"), - ScalarArg(np.intp, "step"), - ]) + arguments.extend( + [ + ScalarArg(np.intp, "start"), + ScalarArg(np.intp, "stop"), + ScalarArg(np.intp, "step"), + ] + ) else: arguments.append(ScalarArg(np.uintp, "n")) @@ -145,8 +175,7 @@ def get_elwise_kernel_and_types(arguments, operation, else: module_builder = get_elwise_module - mod = module_builder(arguments, operation, name, - keep, options, **kwargs) + mod = module_builder(arguments, operation, name, keep, options, **kwargs) func = 
mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) @@ -154,24 +183,34 @@ def get_elwise_kernel_and_types(arguments, operation, return mod, func, arguments -def get_elwise_kernel(arguments, operation, - name="kernel", keep=False, options=None, **kwargs): +def get_elwise_kernel( + arguments, operation, name="kernel", keep=False, options=None, **kwargs +): """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. """ mod, func, arguments = get_elwise_kernel_and_types( - arguments, operation, name, keep, options, **kwargs) + arguments, operation, name, keep, options, **kwargs + ) return func class ElementwiseKernel: - def __init__(self, arguments, operation, - name="kernel", keep=False, options=None, **kwargs): + def __init__( + self, arguments, operation, name="kernel", keep=False, options=None, **kwargs + ): self.gen_kwargs = kwargs.copy() - self.gen_kwargs.update(dict(keep=keep, options=options, name=name, - operation=operation, arguments=arguments)) + self.gen_kwargs.update( + dict( + keep=keep, + options=options, + name=name, + operation=operation, + arguments=arguments, + ) + ) def get_texref(self, name, use_range=False): mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) @@ -179,13 +218,14 @@ class ElementwiseKernel: @memoize_method def generate_stride_kernel_and_types(self, use_range): - mod, knl, arguments = get_elwise_kernel_and_types(use_range=use_range, - **self.gen_kwargs) + mod, knl, arguments = get_elwise_kernel_and_types( + use_range=use_range, **self.gen_kwargs + ) - assert [i for i, arg in enumerate(arguments) - if isinstance(arg, VectorArg)], \ - "ElementwiseKernel can only be used with functions that " \ - "have at least one vector argument" + assert [i for i, arg in enumerate(arguments) if isinstance(arg, VectorArg)], ( + "ElementwiseKernel can only be used with functions that " + "have at least one vector argument" + ) return mod, knl, 
arguments @@ -197,18 +237,22 @@ class ElementwiseKernel: stream = kwargs.pop("stream", None) if kwargs: - raise TypeError("invalid keyword arguments specified: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "invalid keyword arguments specified: " + + ", ".join(kwargs.keys()) + ) invocation_args = [] mod, func, arguments = self.generate_stride_kernel_and_types( - range_ is not None or slice_ is not None) + range_ is not None or slice_ is not None + ) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError( + "elementwise kernel cannot " "deal with non-contiguous arrays" + ) vectors.append(arg) invocation_args.append(arg.gpudata) @@ -219,8 +263,9 @@ class ElementwiseKernel: if slice_ is not None: if range_ is not None: - raise TypeError("may not specify both range and slice " - "keyword arguments") + raise TypeError( + "may not specify both range and slice " "keyword arguments" + ) range_ = slice(*slice_.indices(repr_vec.size)) @@ -233,7 +278,8 @@ class ElementwiseKernel: invocation_args.append(range_.step) from pycuda.gpuarray import splay - grid, block = splay(abs(range_.stop - range_.start)//range_.step) + + grid, block = splay(abs(range_.stop - range_.start) // range_.step) else: block = repr_vec._block grid = repr_vec._grid @@ -245,124 +291,133 @@ class ElementwiseKernel: @context_dependent_memoize def get_take_kernel(dtype, idx_dtype, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), - } - - args = [VectorArg(idx_dtype, "idx")] + [ - VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + [ - ScalarArg(np.intp, "n") - ] + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), + } + + args = ( + [VectorArg(idx_dtype, "idx")] 
+ + [VectorArg(dtype, "dest" + str(i)) for i in range(vec_count)] + + [ScalarArg(np.intp, "n")] + ) preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) - for i in range(vec_count)) - body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) - + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) - for i in range(vec_count))) + for i in range(vec_count) + ) + body = ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( + "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + for i in range(vec_count) + ) mod = get_elwise_module(args, body, "take", preamble=preamble) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] - func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) + func.prepare("P" + (vec_count * "P") + np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src @context_dependent_memoize def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), - } + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), + } - args = [ - VectorArg(idx_dtype, "gmem_dest_idx"), - VectorArg(idx_dtype, "gmem_src_idx"), - ] + [ - VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) - ] + [ - ScalarArg(idx_dtype, "offset%d" % i) - for i in range(vec_count) if with_offsets - ] + [ScalarArg(np.intp, "n")] + args = ( + [ + VectorArg(idx_dtype, "gmem_dest_idx"), + VectorArg(idx_dtype, "gmem_src_idx"), + ] + + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)] + + [ + ScalarArg(idx_dtype, "offset%d" % i) + for i in range(vec_count) + if with_offsets + ] + + [ScalarArg(np.intp, "n")] + ) preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) - 
for i in range(vec_count)) + for i in range(vec_count) + ) if with_offsets: + def get_copy_insn(i): - return ("dest%d[dest_idx] = " - "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" - % (i, i, i)) + return ( + "dest%d[dest_idx] = " + "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" % (i, i, i) + ) + else: + def get_copy_insn(i): - return ("dest%d[dest_idx] = " - "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) + return "dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) - + "\n".join(get_copy_insn(i) for i in range(vec_count))) + body = ( + "%(idx_tp)s src_idx = gmem_src_idx[i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + ) + "\n".join(get_copy_insn(i) for i in range(vec_count)) mod = get_elwise_module(args, body, "take_put", preamble=preamble) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare( - "PP"+(vec_count*"P") - + (bool(with_offsets)*vec_count*idx_dtype.char) - + np.dtype(np.uintp).char, - texrefs=tex_src) + "PP" + + (vec_count * "P") + + (bool(with_offsets) * vec_count * idx_dtype.char) + + np.dtype(np.uintp).char, + texrefs=tex_src, + ) return func, tex_src @context_dependent_memoize def get_put_kernel(dtype, idx_dtype, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - } + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + } - args = [ + args = ( + [ VectorArg(idx_dtype, "gmem_dest_idx"), - ] + [ - VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) - ] + [ - VectorArg(dtype, "src%d" % i) - for i in range(vec_count) - ] + [ScalarArg(np.intp, "n")] + ] + + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)] + + [VectorArg(dtype, "src%d" % i) for i in range(vec_count)] + + [ScalarArg(np.intp, "n")] + ) - body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + 
"\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) - for i in range(vec_count))) + body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join( + "dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count) + ) func = get_elwise_module(args, body, "put").get_function("put") - func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) + func.prepare("P" + (2 * vec_count * "P") + np.dtype(np.uintp).char) return func @context_dependent_memoize def get_copy_kernel(dtype_dest, dtype_src): return get_elwise_kernel( - "%(tp_dest)s *dest, %(tp_src)s *src" % { - "tp_dest": dtype_to_ctype(dtype_dest), - "tp_src": dtype_to_ctype(dtype_src), - }, - "dest[i] = src[i]", - "copy") + "%(tp_dest)s *dest, %(tp_src)s *src" + % { + "tp_dest": dtype_to_ctype(dtype_dest), + "tp_src": dtype_to_ctype(dtype_src), + }, + "dest[i] = src[i]", + "copy", + ) @context_dependent_memoize -def get_linear_combination_kernel(summand_descriptors, - dtype_z): +def get_linear_combination_kernel(summand_descriptors, dtype_z): from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import \ - VectorArg, ScalarArg, get_elwise_module + from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include \n\n"] @@ -370,17 +425,20 @@ def get_linear_combination_kernel(summand_descriptors, summands = [] tex_names = [] - for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ - enumerate(summand_descriptors): + for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in enumerate( + summand_descriptors + ): if is_gpu_scalar: preamble.append( - "texture <%s, 1, cudaReadModeElementType> tex_a%d;" - % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) + "texture <%s, 1, cudaReadModeElementType> tex_a%d;" + % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i) + ) args.append(VectorArg(vector_dtype, "x%d" % i)) tex_names.append("tex_a%d" % i) loop_prep.append( - "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" - % (dtype_to_ctype(scalar_dtype), i, i)) + "%s a%d = 
fp_tex1Dfetch(tex_a%d, 0)" + % (dtype_to_ctype(scalar_dtype), i, i) + ) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) @@ -390,16 +448,17 @@ def get_linear_combination_kernel(summand_descriptors, args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) - mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), - "linear_combination", - preamble="\n".join(preamble), - loop_prep=";\n".join(loop_prep)) + mod = get_elwise_module( + args, + "z[i] = " + " + ".join(summands), + "linear_combination", + preamble="\n".join(preamble), + loop_prep=";\n".join(loop_prep), + ) func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] - func.prepare("".join(arg.struct_char for arg in args), - texrefs=tex_src) + func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src) return func, tex_src @@ -407,71 +466,80 @@ def get_linear_combination_kernel(summand_descriptors, @context_dependent_memoize def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = a*x[i] + b*y[i]", - "axpbyz") + "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = a*x[i] + b*y[i]", + "axpbyz", + ) @context_dependent_memoize def get_axpbz_kernel(dtype_x, dtype_z): return get_elwise_kernel( - "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_z": dtype_to_ctype(dtype_z) - }, - "z[i] = a * x[i] + b", - "axpb") + "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" + % {"tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z)}, + "z[i] = a * x[i] + b", + "axpb", + ) @context_dependent_memoize def 
get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = x[i] %s y[i]" % operator, - "multiply") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = x[i] %s y[i]" % operator, + "multiply", + ) @context_dependent_memoize def get_rdivide_elwise_kernel(dtype_x, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = y / x[i]", - "divide_r") + "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = y / x[i]", + "divide_r", + ) @context_dependent_memoize def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - func+"_kernel") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y[i])" % func, + func + "_kernel", + ) @context_dependent_memoize def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y)" % func, - func+"_kernel") + "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y)" % func, + func + "_kernel", + ) def 
get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): @@ -479,8 +547,9 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): func = func + "f" from pytools import any + if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]): - func = "f"+func + func = "f" + func if use_scalar: return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z) @@ -491,63 +560,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): @context_dependent_memoize def get_fill_kernel(dtype): return get_elwise_kernel( - "%(tp)s a, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = a", - "fill") + "%(tp)s a, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = a", + "fill", + ) @context_dependent_memoize def get_reverse_kernel(dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = y[n-1-i]", - "reverse") + "%(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = y[n-1-i]", + "reverse", + ) @context_dependent_memoize def get_real_kernel(dtype, real_dtype): return get_elwise_kernel( - "%(tp)s *y, %(real_tp)s *z" % { - "tp": dtype_to_ctype(dtype), - "real_tp": dtype_to_ctype(real_dtype), - }, - "z[i] = real(y[i])", - "real") + "%(tp)s *y, %(real_tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + "real_tp": dtype_to_ctype(real_dtype), + }, + "z[i] = real(y[i])", + "real", + ) @context_dependent_memoize def get_imag_kernel(dtype, real_dtype): return get_elwise_kernel( - "%(tp)s *y, %(real_tp)s *z" % { - "tp": dtype_to_ctype(dtype), - "real_tp": dtype_to_ctype(real_dtype), - }, - "z[i] = imag(y[i])", - "imag") + "%(tp)s *y, %(real_tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + "real_tp": dtype_to_ctype(real_dtype), + }, + "z[i] = imag(y[i])", + "imag", + ) @context_dependent_memoize def get_conj_kernel(dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = pycuda::conj(y[i])", - 
"conj") + "%(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = pycuda::conj(y[i])", + "conj", + ) @context_dependent_memoize def get_arange_kernel(dtype): return get_elwise_kernel( - "%(tp)s *z, %(tp)s start, %(tp)s step" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = start + i*step", - "arange") + "%(tp)s *z, %(tp)s start, %(tp)s step" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = start + i*step", + "arange", + ) @context_dependent_memoize @@ -558,11 +639,13 @@ def get_pow_kernel(dtype): func = "pow" return get_elwise_kernel( - "%(tp)s value, %(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = %s(y[i], value)" % func, - "pow_method") + "%(tp)s value, %(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = %s(y[i], value)" % func, + "pow_method", + ) @context_dependent_memoize @@ -573,49 +656,53 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): func = "powf" return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y[i])" % func, + "pow_method", + ) @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( - "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", "fmod_kernel" + ) @context_dependent_memoize def get_modf_kernel(): return get_elwise_kernel( - "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + "float *x, float *intpart ,float *fracpart", + "fracpart[i] = modf(x[i], &intpart[i])", + "modf_kernel", + ) @context_dependent_memoize def get_frexp_kernel(): return 
get_elwise_kernel( - "float *x, float *significand, float *exponent", - """ + "float *x, float *significand, float *exponent", + """ int expt = 0; significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + "frexp_kernel", + ) @context_dependent_memoize def get_ldexp_kernel(): return get_elwise_kernel( - "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", - "ldexp_kernel") + "float *sig, float *expt, float *z", + "z[i] = ldexp(sig[i], int(expt[i]))", + "ldexp_kernel", + ) @context_dependent_memoize @@ -624,33 +711,39 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): out_dtype = in_dtype return get_elwise_kernel( - "%(tp_in)s *y, %(tp_out)s *z" % { - "tp_in": dtype_to_ctype(in_dtype), - "tp_out": dtype_to_ctype(out_dtype), - }, - "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) + "%(tp_in)s *y, %(tp_out)s *z" + % { + "tp_in": dtype_to_ctype(in_dtype), + "tp_out": dtype_to_ctype(out_dtype), + }, + "z[i] = %s(y[i])" % func_name, + "%s_kernel" % func_name, + ) @context_dependent_memoize def get_if_positive_kernel(crit_dtype, dtype): - return get_elwise_kernel([ + return get_elwise_kernel( + [ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), VectorArg(dtype, "else_"), VectorArg(dtype, "result"), - ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", - "if_positive") + ], + "result[i] = crit[i] > 0 ? 
then_[i] : else_[i]", + "if_positive", + ) @context_dependent_memoize def get_scalar_op_kernel(dtype_x, dtype_y, operator): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_a": dtype_to_ctype(dtype_x), - }, - "y[i] = x[i] %s a" % operator, - "scalarop_kernel") + "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_a": dtype_to_ctype(dtype_x), + }, + "y[i] = x[i] %s a" % operator, + "scalarop_kernel", + ) diff --git a/pycuda/gl/__init__.py b/pycuda/gl/__init__.py index 56c0e642f28b3fbfa7f37b02196faf88f16bb4ab..3120d2de74f72fc4b8eddc54ea2b82475d3e259e 100644 --- a/pycuda/gl/__init__.py +++ b/pycuda/gl/__init__.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import pycuda._driver as _drv -if not _drv.have_gl_ext(): +if not _drv.have_gl_ext(): raise ImportError("PyCUDA was compiled without GL extension support") init = _drv.gl_init diff --git a/pycuda/gl/autoinit.py b/pycuda/gl/autoinit.py index 13f6717d8720670a2ca0453e9ee86f22f5741b2b..fc8c9a668f365ff5aa2be3c89cc97413c634f87a 100644 --- a/pycuda/gl/autoinit.py +++ b/pycuda/gl/autoinit.py @@ -1,13 +1,13 @@ from __future__ import absolute_import import pycuda.driver as cuda import pycuda.gl as cudagl +import atexit cuda.init() assert cuda.Device.count() >= 1 -from pycuda.tools import make_default_context +from pycuda.tools import make_default_context # noqa: E402 context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() -import atexit atexit.register(context.pop) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 12e4e0391311058d05468eb9d24996ddd569a6ea..97630e3bdeb572e2d881deeea840d1ff1bce3cc0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,20 +1,20 @@ -from __future__ import division, absolute_import - import numpy as np import pycuda.elementwise as elementwise from pytools import 
memoize, memoize_method import pycuda.driver as drv from pycuda.compyte.array import ( - as_strided as _as_strided, - f_contiguous_strides as _f_contiguous_strides, - c_contiguous_strides as _c_contiguous_strides, - ArrayFlags as _ArrayFlags, - get_common_dtype as _get_common_dtype_base) + as_strided as _as_strided, + f_contiguous_strides as _f_contiguous_strides, + c_contiguous_strides as _c_contiguous_strides, + ArrayFlags as _ArrayFlags, + get_common_dtype as _get_common_dtype_base, +) from pycuda.characterize import has_double_support -import six -from six.moves import range, zip, reduce +from functools import reduce import numbers +import copyreg + def _get_common_dtype(obj1, obj2): return _get_common_dtype_base(obj1, obj2, has_double_support()) @@ -22,12 +22,14 @@ def _get_common_dtype(obj1, obj2): # {{{ vector types + class vec: # noqa pass def _create_vector_types(): from pycuda.characterize import platform_bits + if platform_bits() == 32: long_dtype = np.int32 ulong_dtype = np.uint32 @@ -40,35 +42,40 @@ def _create_vector_types(): from pycuda.tools import get_or_register_dtype for base_name, base_type, counts in [ - ('char', np.int8, [1, 2, 3, 4]), - ('uchar', np.uint8, [1, 2, 3, 4]), - ('short', np.int16, [1, 2, 3, 4]), - ('ushort', np.uint16, [1, 2, 3, 4]), - ('int', np.int32, [1, 2, 3, 4]), - ('uint', np.uint32, [1, 2, 3, 4]), - ('long', long_dtype, [1, 2, 3, 4]), - ('ulong', ulong_dtype, [1, 2, 3, 4]), - ('longlong', np.int64, [1, 2]), - ('ulonglong', np.uint64, [1, 2]), - ('float', np.float32, [1, 2, 3, 4]), - ('double', np.float64, [1, 2]), - ]: + ("char", np.int8, [1, 2, 3, 4]), + ("uchar", np.uint8, [1, 2, 3, 4]), + ("short", np.int16, [1, 2, 3, 4]), + ("ushort", np.uint16, [1, 2, 3, 4]), + ("int", np.int32, [1, 2, 3, 4]), + ("uint", np.uint32, [1, 2, 3, 4]), + ("long", long_dtype, [1, 2, 3, 4]), + ("ulong", ulong_dtype, [1, 2, 3, 4]), + ("longlong", np.int64, [1, 2]), + ("ulonglong", np.uint64, [1, 2]), + ("float", np.float32, [1, 2, 3, 4]), + 
("double", np.float64, [1, 2]), + ]: for count in counts: name = "%s%d" % (base_name, count) - dtype = np.dtype([ - (field_names[i], base_type) - for i in range(count)]) + dtype = np.dtype([(field_names[i], base_type) for i in range(count)]) get_or_register_dtype(name, dtype) setattr(vec, name, dtype) my_field_names = ",".join(field_names[:count]) - setattr(vec, "make_"+name, - staticmethod(eval( + setattr( + vec, + "make_" + name, + staticmethod( + eval( "lambda %s: array((%s), dtype=my_dtype)" % (my_field_names, my_field_names), - dict(array=np.array, my_dtype=dtype)))) + dict(array=np.array, my_dtype=dtype), + ) + ), + ) + _create_vector_types() @@ -77,16 +84,21 @@ _create_vector_types() # {{{ helper functionality + @memoize def _splay_backend(n, dev): # heavily modified from cublas from pycuda.tools import DeviceData + devdata = DeviceData(dev) min_threads = devdata.warp_size max_threads = 128 - max_blocks = 4 * devdata.thread_blocks_per_mp \ - * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) + max_blocks = ( + 4 + * devdata.thread_blocks_per_mp + * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) + ) if n < min_threads: block_count = 1 @@ -111,46 +123,62 @@ def splay(n, dev=None): dev = drv.Context.get_device() return _splay_backend(n, dev) + # }}} # {{{ main GPUArray class + def _make_binary_op(operator): def func(self, other): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if isinstance(other, GPUArray): assert self.shape == other.shape if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) result = self._new_like_me() func = elementwise.get_binary_op_kernel( - self.dtype, other.dtype, result.dtype, - 
operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.dtype, other.dtype, result.dtype, operator + ) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other.gpudata, + result.gpudata, + self.mem_size, + ) return result else: # scalar operator result = self._new_like_me() - func = elementwise.get_scalar_op_kernel( - self.dtype, result.dtype, operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, - self.mem_size) + func = elementwise.get_scalar_op_kernel(self.dtype, result.dtype, operator) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other, + result.gpudata, + self.mem_size, + ) return result return func -class GPUArray(object): +class GPUArray: """A GPUArray is used to do array-based calculation on the GPU. This is mostly supposed to be a numpy-workalike. Operators @@ -159,8 +187,16 @@ class GPUArray(object): __array_priority__ = 100 - def __init__(self, shape, dtype, allocator=drv.mem_alloc, - base=None, gpudata=None, strides=None, order="C"): + def __init__( + self, + shape, + dtype, + allocator=drv.mem_alloc, + base=None, + gpudata=None, + strides=None, + order="C", + ): dtype = np.dtype(dtype) try: @@ -184,11 +220,9 @@ class GPUArray(object): if strides is None: if order == "F": - strides = _f_contiguous_strides( - dtype.itemsize, shape) + strides = _f_contiguous_strides(dtype.itemsize, shape) elif order == "C": - strides = _c_contiguous_strides( - dtype.itemsize, shape) + strides = _c_contiguous_strides(dtype.itemsize, shape) else: raise ValueError("invalid order: %s" % order) else: @@ -240,8 +274,7 @@ class GPUArray(object): async_ = False if kwargs: - raise TypeError("extra keyword arguments specified: %s" - % ", ".join(kwargs)) + raise TypeError("extra keyword arguments specified: %s" % ", ".join(kwargs)) # }}} @@ -249,8 +282,8 @@ class GPUArray(object): 
raise ValueError("ary and self must be the same size") if ary.shape != self.shape: from warnings import warn - warn("Setting array from one with different shape.", - stacklevel=2) + + warn("Setting array from one with different shape.", stacklevel=2) ary = ary.reshape(self.shape) if ary.dtype != self.dtype: @@ -275,8 +308,7 @@ class GPUArray(object): async_ = False if kwargs: - raise TypeError("extra keyword arguments specified: %s" - % ", ".join(kwargs)) + raise TypeError("extra keyword arguments specified: %s" % ", ".join(kwargs)) # }}} @@ -293,9 +325,13 @@ class GPUArray(object): raise ValueError("self and ary must be the same size") if self.shape != ary.shape: from warnings import warn - warn("get() between arrays of different shape is deprecated " - "and will be removed in PyCUDA 2017.x", - DeprecationWarning, stacklevel=2) + + warn( + "get() between arrays of different shape is deprecated " + "and will be removed in PyCUDA 2017.x", + DeprecationWarning, + stacklevel=2, + ) ary = ary.reshape(self.shape) if self.dtype != ary.dtype: @@ -332,19 +368,37 @@ class GPUArray(object): where `other` is a vector..""" assert self.shape == other.shape if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: - add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + add_timer( + 3 * self.size, + func.prepared_timed_call( + self._grid, + selffac, + self.gpudata, + otherfac, + other.gpudata, + out.gpudata, + self.mem_size, + ), + ) else: - func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + func.prepared_async_call( + 
self._grid, + self._block, + stream, + selffac, + self.gpudata, + otherfac, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out @@ -352,43 +406,64 @@ class GPUArray(object): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + stream, + selffac, + self.gpudata, + other, + out.gpudata, + self.mem_size, + ) return out def _elwise_multiply(self, other, out, stream=None): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "*") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) + + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out def _rdiv_scalar(self, other, out, stream=None): """Divides an array by a scalar:: - y = n / self + y = n / self """ if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, 
other, - out.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other, + out.gpudata, + self.mem_size, + ) return out @@ -396,16 +471,22 @@ class GPUArray(object): """Divides an array by another array.""" if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) assert self.shape == other.shape - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "/") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out @@ -416,13 +497,13 @@ class GPUArray(object): if dtype == self.dtype: strides = self.strides - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + return self.__class__( + self.shape, dtype, allocator=self.allocator, strides=strides, order=order + ) # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): - """Return `selffac * self + otherfac*other`. 
- """ + """Return `selffac * self + otherfac*other`.""" result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbyz(selffac, other, otherfac, result, add_timer) @@ -460,7 +541,7 @@ class GPUArray(object): def __rsub__(self, other): """Substracts an array by a scalar or an array:: - x = n - self + x = n - self """ # other must be a scalar result = self._new_like_me(_get_common_dtype(self, other)) @@ -503,7 +584,7 @@ class GPUArray(object): def __div__(self, other): """Divides an array by an array or a scalar:: - x = self / n + x = self / n """ if isinstance(other, GPUArray): result = self._new_like_me(_get_common_dtype(self, other)) @@ -514,14 +595,14 @@ class GPUArray(object): else: # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) - return self._axpbz(1/other, 0, result) + return self._axpbz(1 / other, 0, result) __truediv__ = __div__ def __rdiv__(self, other): """Divides an array by a scalar or an array:: - x = n / self + x = n / self """ # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) @@ -532,7 +613,7 @@ class GPUArray(object): def __idiv__(self, other): """Divides an array by an array or a scalar:: - x /= n + x /= n """ if isinstance(other, GPUArray): return self._div(other, self) @@ -540,33 +621,43 @@ class GPUArray(object): if other == 1: return self else: - return self._axpbz(1/other, 0, self) + return self._axpbz(1 / other, 0, self) __itruediv__ = __idiv__ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, value, self.gpudata, self.mem_size + ) return self def bind_to_texref(self, texref, allow_offset=False): - return texref.set_address(self.gpudata, self.nbytes, - allow_offset=allow_offset) / self.dtype.itemsize 
- - def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, - allow_complex_hack=False, allow_offset=False): + return ( + texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) + / self.dtype.itemsize + ) + + def bind_to_texref_ext( + self, + texref, + channels=1, + allow_double_hack=False, + allow_complex_hack=False, + allow_offset=False, + ): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError( - "'fake' double precision textures can " - "only have one channel") + "'fake' double precision textures can " "only have one channel" + ) channels = 2 fmt = drv.array_format.SIGNED_INT32 @@ -574,8 +665,8 @@ class GPUArray(object): elif self.dtype == np.complex64 and allow_complex_hack: if channels != 1: raise ValueError( - "'fake' complex64 textures can " - "only have one channel") + "'fake' complex64 textures can " "only have one channel" + ) channels = 2 fmt = drv.array_format.UNSIGNED_INT32 @@ -583,8 +674,8 @@ class GPUArray(object): elif self.dtype == np.complex128 and allow_complex_hack: if channels != 1: raise ValueError( - "'fake' complex128 textures can " - "only have one channel") + "'fake' complex128 textures can " "only have one channel" + ) channels = 4 fmt = drv.array_format.SIGNED_INT32 @@ -593,14 +684,15 @@ class GPUArray(object): fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ - offset = texref.set_address(self.gpudata, self.nbytes, - allow_offset=allow_offset) + offset = texref.set_address( + self.gpudata, self.nbytes, allow_offset=allow_offset + ) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) - return offset/self.dtype.itemsize + return offset / 
self.dtype.itemsize def __len__(self): """Return the size of the leading dimension of self.""" @@ -625,16 +717,17 @@ class GPUArray(object): if issubclass(self.dtype.type, np.complexfloating): from pytools import match_precision + out_dtype = match_precision(np.dtype(np.float64), self.dtype) result = self._new_like_me(out_dtype) else: out_dtype = self.dtype - func = elementwise.get_unary_func_kernel(fname, self.dtype, - out_dtype=out_dtype) + func = elementwise.get_unary_func_kernel(fname, self.dtype, out_dtype=out_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size + ) return result @@ -646,8 +739,10 @@ class GPUArray(object): if isinstance(other, GPUArray): if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) assert self.shape == other.shape @@ -657,50 +752,64 @@ class GPUArray(object): result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype) - - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.dtype, other.dtype, result.dtype + ) + + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other.gpudata, + result.gpudata, + self.mem_size, + ) return result else: if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) if new: result = self._new_like_me() else: result = self func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, 
- self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + other, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result def __pow__(self, other): """pow function:: - example: - array = pow(array) - array = pow(array,4) - array = pow(array,array) + example: + array = pow(array) + array = pow(array,4) + array = pow(array,array) """ - return self._pow(other,new=True) + return self._pow(other, new=True) def __ipow__(self, other): """ipow function:: - example: - array **= 4 - array **= array + example: + array **= 4 + array **= array """ - return self._pow(other,new=False) - + return self._pow(other, new=False) def reverse(self, stream=None): """Return this array in reversed order. The array is treated @@ -708,22 +817,24 @@ class GPUArray(object): """ if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, self.gpudata, result.gpudata, self.mem_size + ) return result def astype(self, dtype, stream=None): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if dtype == self.dtype: return self.copy() @@ -731,9 +842,9 @@ class GPUArray(object): result = self._new_like_me(dtype=dtype) func = elementwise.get_copy_kernel(dtype, self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, result.gpudata, self.gpudata, self.mem_size + ) return result 
@@ -745,14 +856,16 @@ class GPUArray(object): # TODO: add more error-checking, perhaps if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) - same_contiguity = ((order == "C" and self.flags.c_contiguous) or - (order == "F" and self.flags.f_contiguous)) + same_contiguity = (order == "C" and self.flags.c_contiguous) or ( + order == "F" and self.flags.f_contiguous + ) if shape == self.shape and same_contiguity: return self @@ -771,12 +884,13 @@ class GPUArray(object): raise ValueError("total size of new array must be unchanged") return GPUArray( - shape=shape, - dtype=self.dtype, - allocator=self.allocator, - base=self, - gpudata=int(self.gpudata), - order=order) + shape=shape, + dtype=self.dtype, + allocator=self.allocator, + base=self, + gpudata=int(self.gpudata), + order=order, + ) def ravel(self): return self.reshape(self.size) @@ -789,29 +903,33 @@ class GPUArray(object): itemsize = np.dtype(dtype).itemsize from pytools import argmin2 + min_stride_axis = argmin2( - (axis, abs(stride)) - for axis, stride in enumerate(self.strides)) + (axis, abs(stride)) for axis, stride in enumerate(self.strides) + ) if self.shape[min_stride_axis] * old_itemsize % itemsize != 0: raise ValueError("new type not compatible with array") new_shape = ( - self.shape[:min_stride_axis] - + (self.shape[min_stride_axis] * old_itemsize // itemsize,) - + self.shape[min_stride_axis+1:]) + self.shape[:min_stride_axis] + + (self.shape[min_stride_axis] * old_itemsize // itemsize,) + + self.shape[min_stride_axis + 1:] + ) new_strides = ( - self.strides[:min_stride_axis] - + (self.strides[min_stride_axis] * itemsize // old_itemsize,) - + self.strides[min_stride_axis+1:]) + self.strides[:min_stride_axis] + + (self.strides[min_stride_axis] * itemsize // 
old_itemsize,) + + self.strides[min_stride_axis + 1:] + ) return GPUArray( - shape=new_shape, - dtype=dtype, - allocator=self.allocator, - strides=new_strides, - base=self, - gpudata=int(self.gpudata)) + shape=new_shape, + dtype=dtype, + allocator=self.allocator, + strides=new_strides, + base=self, + gpudata=int(self.gpudata), + ) def squeeze(self): """ @@ -819,8 +937,9 @@ class GPUArray(object): length 1 removed. """ new_shape = tuple([dim for dim in self.shape if dim > 1]) - new_strides = tuple([self.strides[i] - for i, dim in enumerate(self.shape) if dim > 1]) + new_strides = tuple( + [self.strides[i] for i, dim in enumerate(self.shape) if dim > 1] + ) return GPUArray( shape=new_shape, @@ -828,7 +947,8 @@ class GPUArray(object): allocator=self.allocator, strides=new_strides, base=self, - gpudata=int(self.gpudata)) + gpudata=int(self.gpudata), + ) def transpose(self, axes=None): """Permute the dimensions of an array. @@ -843,17 +963,19 @@ class GPUArray(object): """ if axes is None: - axes = range(self.ndim-1, -1, -1) + axes = range(self.ndim - 1, -1, -1) if len(axes) != len(self.shape): raise ValueError("axes don't match array") new_shape = [self.shape[axes[i]] for i in range(len(axes))] new_strides = [self.strides[axes[i]] for i in range(len(axes))] - return GPUArray(shape=tuple(new_shape), - dtype=self.dtype, - allocator=self.allocator, - base=self.base or self, - gpudata=self.gpudata, - strides=tuple(new_strides)) + return GPUArray( + shape=tuple(new_shape), + dtype=self.dtype, + allocator=self.allocator, + base=self.base or self, + gpudata=self.gpudata, + strides=tuple(new_strides), + ) @property def T(self): # noqa @@ -886,14 +1008,13 @@ class GPUArray(object): raise IndexError("too many axes in index") if isinstance(index_entry, slice): - start, stop, idx_stride = index_entry.indices( - self.shape[array_axis]) + start, stop, idx_stride = index_entry.indices(self.shape[array_axis]) array_stride = self.strides[array_axis] - 
new_shape.append((abs(stop-start)-1)//abs(idx_stride)+1) - new_strides.append(idx_stride*array_stride) - new_offset += array_stride*start + new_shape.append((abs(stop - start) - 1) // abs(idx_stride) + 1) + new_strides.append(idx_stride * array_stride) + new_offset += array_stride * start index_axis += 1 array_axis += 1 @@ -904,10 +1025,9 @@ class GPUArray(object): index_entry += array_shape if not (0 <= index_entry < array_shape): - raise IndexError( - "subindex in axis %d out of range" % index_axis) + raise IndexError("subindex in axis %d out of range" % index_axis) - new_offset += self.strides[array_axis]*index_entry + new_offset += self.strides[array_axis] * index_entry index_axis += 1 array_axis += 1 @@ -925,8 +1045,7 @@ class GPUArray(object): array_axis += 1 if seen_ellipsis: - raise IndexError( - "more than one ellipsis not allowed in index") + raise IndexError("more than one ellipsis not allowed in index") seen_ellipsis = True elif index_entry is np.newaxis: @@ -944,12 +1063,13 @@ class GPUArray(object): array_axis += 1 return GPUArray( - shape=tuple(new_shape), - dtype=self.dtype, - allocator=self.allocator, - base=self, - gpudata=int(self.gpudata)+new_offset, - strides=tuple(new_strides)) + shape=tuple(new_shape), + dtype=self.dtype, + allocator=self.allocator, + base=self, + gpudata=int(self.gpudata) + new_offset, + strides=tuple(new_strides), + ) def __setitem__(self, index, value): _memcpy_discontig(self[index], value) @@ -963,6 +1083,7 @@ class GPUArray(object): dtype = self.dtype if issubclass(dtype.type, np.complexfloating): from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) if self.flags.f_contiguous: order = "F" @@ -971,9 +1092,14 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_real_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + 
self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -984,10 +1110,13 @@ class GPUArray(object): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) if self.flags.f_contiguous: order = "F" @@ -996,9 +1125,14 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_imag_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -1008,8 +1142,10 @@ class GPUArray(object): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) if self.flags.f_contiguous: order = "F" @@ -1018,9 +1154,14 @@ class GPUArray(object): result = self._new_like_me(order=order) func = elementwise.get_conj_kernel(dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -1039,11 +1180,13 @@ class GPUArray(object): # }}} + # }}} # {{{ creation helpers + def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) @@ 
-1089,7 +1232,7 @@ def _array_like_helper(other_ary, dtype, order): # scale strides by itemsize when dtype is not the same itemsize = other_ary.nbytes // other_ary.size itemsize_ratio = np.dtype(dtype).itemsize / itemsize - strides = [int(s*itemsize_ratio) for s in strides] + strides = [int(s * itemsize_ratio) for s in strides] elif order not in ["C", "F"]: raise ValueError("Unsupported order: %r" % order) if dtype is None: @@ -1100,16 +1243,16 @@ def _array_like_helper(other_ary, dtype, order): def empty_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) return result def zeros_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) zero = np.zeros((), result.dtype) result.fill(zero) return result @@ -1118,8 +1261,8 @@ def zeros_like(other_ary, dtype=None, order="K"): def ones_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) one = np.ones((), result.dtype) result.fill(one) return result @@ -1171,7 +1314,7 @@ def arange(*args, **kwargs): raise ValueError("too many arguments") admissible_names = ["start", "stop", "step", "dtype"] - for k, v in six.iteritems(kwargs): + for k, v in kwargs.items(): if k in admissible_names: if getattr(inf, k) is None: setattr(inf, k, v) @@ -1197,30 +1340,42 @@ def arange(*args, **kwargs): if not explicit_dtype and dtype != np.float32: from 
warnings import warn - warn("behavior change: arange guessed dtype other than float32. " - "suggest specifying explicit dtype.") + + warn( + "behavior change: arange guessed dtype other than float32. " + "suggest specifying explicit dtype." + ) from math import ceil - size = int(ceil((stop-start)/step)) + + size = int(ceil((stop - start) / step)) result = GPUArray((size,), dtype) func = elementwise.get_arange_kernel(dtype) - func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + func.prepared_async_call( + result._grid, + result._block, + kwargs.get("stream"), + result.gpudata, + start, + step, + size, + ) return result + # }}} def _compact_strides(a): # Compute strides to have same order as self, but packed info = sorted( - (a.strides[axis], a.shape[axis], axis) - for axis in range(len(a.shape))) + (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape)) + ) - strides = [None]*len(a.shape) + strides = [None] * len(a.shape) stride = a.dtype.itemsize for _, dim, axis in info: strides[axis] = stride @@ -1257,17 +1412,21 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # put src in Fortran order (which should put dst in Fortran order too) # and remove singleton axes src_info = sorted( - (src.strides[axis], axis) - for axis in range(len(src.shape)) if src.shape[axis] > 1) + (src.strides[axis], axis) + for axis in range(len(src.shape)) + if src.shape[axis] > 1 + ) axes = [axis for _, axis in src_info] shape = [src.shape[axis] for axis in axes] src_strides = [src.strides[axis] for axis in axes] dst_strides = [dst.strides[axis] for axis in axes] # copy functions require contiguity in minor axis, so add new axis if needed - if (len(shape) == 0 - or src_strides[0] != src.dtype.itemsize - or dst_strides[0] != dst.dtype.itemsize): + if ( + len(shape) == 0 + or src_strides[0] != src.dtype.itemsize + or dst_strides[0] != dst.dtype.itemsize + ): shape[0:0] = [1] src_strides[0:0] = [0] 
dst_strides[0:0] = [0] @@ -1277,11 +1436,13 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # and check that dst is in same order as src i = 1 while i < len(shape): - if dst_strides[i] < dst_strides[i-1]: + if dst_strides[i] < dst_strides[i - 1]: raise ValueError("src and dst must have same order") - if (src_strides[i-1] * shape[i-1] == src_strides[i] and - dst_strides[i-1] * shape[i-1] == dst_strides[i]): - shape[i-1:i+1] = [shape[i-1] * shape[i]] + if ( + src_strides[i - 1] * shape[i - 1] == src_strides[i] + and dst_strides[i - 1] * shape[i - 1] == dst_strides[i] + ): + shape[i - 1:i + 1] = [shape[i - 1] * shape[i]] del src_strides[i] del dst_strides[i] del axes[i] @@ -1293,7 +1454,8 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): if isinstance(dst, GPUArray): if async_: drv.memcpy_dtod_async( - dst.gpudata, src.gpudata, src.nbytes, stream=stream) + dst.gpudata, src.gpudata, src.nbytes, stream=stream + ) else: drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes) else: @@ -1301,8 +1463,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # having no gaps, but the axes could be transposed # so that the order is neither Fortran or C. # So, we attempt to get a contiguous view of dst. 
- dst = _as_strided( - dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) + dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) if async_: drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream) else: @@ -1321,8 +1482,8 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): copy = drv.Memcpy3D() else: raise ValueError( - "more than 2 discontiguous axes not supported %s" - % (tuple(sorted(axes)),)) + "more than 2 discontiguous axes not supported {}".format(tuple(sorted(axes))) + ) if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) @@ -1334,7 +1495,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): else: copy.set_dst_host(dst) - copy.width_in_bytes = src.dtype.itemsize*shape[0] + copy.width_in_bytes = src.dtype.itemsize * shape[0] copy.src_pitch = src_strides[1] copy.dst_pitch = dst_strides[1] @@ -1348,13 +1509,11 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): else: # len(shape) == 3 if src_strides[2] % src_strides[1] != 0: - raise RuntimeError( - "src's major stride must be a multiple of middle stride") + raise RuntimeError("src's major stride must be a multiple of middle stride") copy.src_height = src_strides[2] // src_strides[1] if dst_strides[2] % dst_strides[1] != 0: - raise RuntimeError( - "dst's major stride must be a multiple of middle stride") + raise RuntimeError("dst's major stride must be a multiple of middle stride") copy.dst_height = dst_strides[2] // dst_strides[1] copy.depth = shape[2] @@ -1366,16 +1525,14 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # {{{ pickle support -import six.moves.copyreg -six.moves.copyreg.pickle(GPUArray, - lambda data: (to_gpu, (data.get(),)), - to_gpu) +copyreg.pickle(GPUArray, lambda data: (to_gpu, (data.get(),)), to_gpu) # }}} # {{{ take/put + def take(a, indices, out=None, stream=None): if out is None: out = GPUArray(indices.shape, a.dtype, a.allocator) @@ -1385,8 +1542,9 @@ def take(a, indices, out=None, stream=None): func, 
tex_src = elementwise.get_take_kernel(a.dtype, indices.dtype) a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) - func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + func.prepared_async_call( + out._grid, out._block, stream, indices.gpudata, out.gpudata, indices.size + ) return out @@ -1398,14 +1556,14 @@ def multi_take(arrays, indices, out=None, stream=None): assert len(indices.shape) == 1 from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].dtype vec_count = len(arrays) if out is None: - out = [GPUArray(indices.shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(indices.shape, a_dtype, a_allocator) for i in range(vec_count)] else: if len(out) != len(arrays): raise ValueError("out and arrays must have the same length") @@ -1413,42 +1571,51 @@ def multi_take(arrays, indices, out=None, stream=None): chunk_size = _builtin_min(vec_count, 20) def make_func_for_chunk_size(chunk_size): - return elementwise.get_take_kernel(a_dtype, indices.dtype, - vec_count=chunk_size) + return elementwise.get_take_kernel(a_dtype, indices.dtype, vec_count=chunk_size) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func, tex_src = make_func_for_chunk_size(vec_count-start_i) + func, tex_src = make_func_for_chunk_size(vec_count - start_i) for i, a in enumerate(arrays[chunk_slice]): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) - func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [indices.size])) + func.prepared_async_call( + indices._grid, + indices._block, + stream, + indices.gpudata, + *([o.gpudata for o in out[chunk_slice]] + [indices.size]) + ) 
return out -def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, - out=None, stream=None, src_offsets=None): +def multi_take_put( + arrays, + dest_indices, + src_indices, + dest_shape=None, + out=None, + stream=None, + src_offsets=None, +): if not len(arrays): return [] from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: - out = [GPUArray(dest_shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") @@ -1477,26 +1644,35 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, def make_func_for_chunk_size(chunk_size): return elementwise.get_take_put_kernel( - a_dtype, src_indices.dtype, - with_offsets=src_offsets is not None, - vec_count=chunk_size) + a_dtype, + src_indices.dtype, + with_offsets=src_offsets is not None, + vec_count=chunk_size, + ) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func, tex_src = make_func_for_chunk_size(vec_count-start_i) + func, tex_src = make_func_for_chunk_size(vec_count - start_i) for src_tr, a in zip(tex_src, arrays[chunk_slice]): a.bind_to_texref_ext(src_tr, allow_double_hack=True) - func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + src_offsets_list[chunk_slice] - + [src_indices.size])) + func.prepared_async_call( + src_indices._grid, + src_indices._block, + stream, + dest_indices.gpudata, + src_indices.gpudata, + *( + [o.gpudata for o in out[chunk_slice]] + + src_offsets_list[chunk_slice] + 
+ [src_indices.size] + ) + ) return out @@ -1506,14 +1682,14 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): return [] from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: - out = [GPUArray(dest_shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") @@ -1527,29 +1703,38 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): def make_func_for_chunk_size(chunk_size): return elementwise.get_put_kernel( - a_dtype, dest_indices.dtype, vec_count=chunk_size) + a_dtype, dest_indices.dtype, vec_count=chunk_size + ) func = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func = make_func_for_chunk_size(vec_count-start_i) - - func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] - + [dest_indices.size])) + func = make_func_for_chunk_size(vec_count - start_i) + + func.prepared_async_call( + dest_indices._grid, + dest_indices._block, + stream, + dest_indices.gpudata, + *( + [o.gpudata for o in out[chunk_slice]] + + [i.gpudata for i in arrays[chunk_slice]] + + [dest_indices.size] + ) + ) return out + # }}} # {{{ shape manipulation + def transpose(a, axes=None): """Permute the dimensions of an array. 
@@ -1573,11 +1758,13 @@ def reshape(a, *shape, **kwargs): return a.reshape(*shape, **kwargs) + # }}} # {{{ conditionals + def if_positive(criterion, then_, else_, out=None, stream=None): if not (criterion.shape == then_.shape == else_.shape): raise ValueError("shapes do not match") @@ -1585,15 +1772,21 @@ def if_positive(criterion, then_, else_, out=None, stream=None): if not (then_.dtype == else_.dtype): raise ValueError("dtypes do not match") - func = elementwise.get_if_positive_kernel( - criterion.dtype, then_.dtype) + func = elementwise.get_if_positive_kernel(criterion.dtype, then_.dtype) if out is None: out = empty_like(then_) - func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, - criterion.size) + func.prepared_async_call( + criterion._grid, + criterion._block, + stream, + criterion.gpudata, + then_.gpudata, + else_.gpudata, + out.gpudata, + criterion.size, + ) return out @@ -1603,27 +1796,34 @@ def _make_binary_minmax_func(which): if isinstance(a, GPUArray) and isinstance(b, GPUArray): if out is None: out = empty_like(a) - func = elementwise.get_binary_minmax_kernel(which, - a.dtype, b.dtype, out.dtype, use_scalar=False) + func = elementwise.get_binary_minmax_kernel( + which, a.dtype, b.dtype, out.dtype, use_scalar=False + ) - func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + func.prepared_async_call( + a._grid, a._block, stream, a.gpudata, b.gpudata, out.gpudata, a.size + ) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) - func = elementwise.get_binary_minmax_kernel(which, - a.dtype, a.dtype, out.dtype, use_scalar=True) - func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + func = elementwise.get_binary_minmax_kernel( + which, a.dtype, a.dtype, out.dtype, use_scalar=True + ) + func.prepared_async_call( + a._grid, a._block, stream, a.gpudata, b, out.gpudata, a.size + ) else: 
# assuming b is a GPUArray if out is None: out = empty_like(b) - func = elementwise.get_binary_minmax_kernel(which, - b.dtype, b.dtype, out.dtype, use_scalar=True) + func = elementwise.get_binary_minmax_kernel( + which, b.dtype, b.dtype, out.dtype, use_scalar=True + ) # NOTE: we switch the order of a and b here! - func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + func.prepared_async_call( + b._grid, b._block, stream, b.gpudata, a, out.gpudata, b.size + ) return out + return f @@ -1635,20 +1835,24 @@ maximum = _make_binary_minmax_func("max") # {{{ reductions + def sum(a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_sum_kernel + krnl = get_sum_kernel(dtype, a.dtype) return krnl(a, stream=stream, allocator=allocator) def subset_sum(subset, a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_sum_kernel + krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype) return krnl(subset, a, stream=stream) def dot(a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_dot_kernel + if dtype is None: dtype = _get_common_dtype(a, b) krnl = get_dot_kernel(dtype, a.dtype, b.dtype) @@ -1657,6 +1861,7 @@ def dot(a, b, dtype=None, stream=None, allocator=None): def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_dot_kernel + krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype) return krnl(subset, a, b, stream=stream, allocator=allocator) @@ -1664,11 +1869,13 @@ def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): def _make_minmax_kernel(what): def f(a, stream=None): from pycuda.reduction import get_minmax_kernel + krnl = get_minmax_kernel(what, a.dtype) - return krnl(a, stream=stream) + return krnl(a, stream=stream) return f + _builtin_min = min _builtin_max = max min = _make_minmax_kernel("min") @@ -1678,11 +1885,13 @@ max = _make_minmax_kernel("max") def 
_make_subset_minmax_kernel(what): def f(subset, a, stream=None): from pycuda.reduction import get_subset_minmax_kernel + krnl = get_subset_minmax_kernel(what, a.dtype, subset.dtype) - return krnl(subset, a, stream=stream) + return krnl(subset, a, stream=stream) return f + subset_min = _make_subset_minmax_kernel("min") subset_max = _make_subset_minmax_kernel("max") diff --git a/pycuda/reduction.py b/pycuda/reduction.py index 939a006a9617754c4ce24111034f08e4838c2a97..2651353f6d49c3bfb4a1e9d584aa6c58f4718fec 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -1,8 +1,5 @@ """Computation of reductions on vectors.""" -from __future__ import division -from __future__ import absolute_import -from six.moves import zip __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -65,11 +62,21 @@ from pycuda.tools import dtype_to_ctype import numpy as np -def get_reduction_module(out_type, block_size, - neutral, reduce_expr, map_expr, arguments, - name="reduce_kernel", keep=False, options=None, preamble=""): +def get_reduction_module( + out_type, + block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name="reduce_kernel", + keep=False, + options=None, + preamble="", +): from pycuda.compiler import SourceModule + src = """ #include @@ -138,23 +145,31 @@ def get_reduction_module(out_type, block_size, if (tid == 0) out[blockIdx.x] = sdata[0]; } """ % { - "out_type": out_type, - "arguments": arguments, - "block_size": block_size, - "neutral": neutral, - "reduce_expr": reduce_expr, - "map_expr": map_expr, - "name": name, - "preamble": preamble - } + "out_type": out_type, + "arguments": arguments, + "block_size": block_size, + "neutral": neutral, + "reduce_expr": reduce_expr, + "map_expr": map_expr, + "name": name, + "preamble": preamble, + } return SourceModule(src, options=options, keep=keep, no_extern_c=True) - - -def get_reduction_kernel_and_types(stage, out_type, block_size, - neutral, reduce_expr, map_expr=None, arguments=None, - name="reduce_kernel", 
keep=False, options=None, preamble=""): +def get_reduction_kernel_and_types( + stage, + out_type, + block_size, + neutral, + reduce_expr, + map_expr=None, + arguments=None, + name="reduce_kernel", + keep=False, + options=None, + preamble="", +): if stage == 1: if map_expr is None: @@ -173,11 +188,21 @@ def get_reduction_kernel_and_types(stage, out_type, block_size, else: assert False - mod = get_reduction_module(out_type, block_size, - neutral, reduce_expr, map_expr, arguments, - name, keep, options, preamble) + mod = get_reduction_module( + out_type, + block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name, + keep, + options, + preamble, + ) from pycuda.tools import get_arg_type + func = mod.get_function(name) arg_types = [get_arg_type(arg) for arg in arguments.split(",")] func.prepare("P%sII" % "".join(arg_types)) @@ -185,35 +210,58 @@ def get_reduction_kernel_and_types(stage, out_type, block_size, return func, arg_types - - class ReductionKernel: - def __init__(self, dtype_out, - neutral, reduce_expr, map_expr=None, arguments=None, - name="reduce_kernel", keep=False, options=None, preamble=""): + def __init__( + self, + dtype_out, + neutral, + reduce_expr, + map_expr=None, + arguments=None, + name="reduce_kernel", + keep=False, + options=None, + preamble="", + ): self.dtype_out = np.dtype(dtype_out) self.block_size = 512 s1_func, self.stage1_arg_types = get_reduction_kernel_and_types( - 1, dtype_to_ctype(dtype_out), self.block_size, - neutral, reduce_expr, map_expr, - arguments, name=name+"_stage1", keep=keep, options=options, - preamble=preamble) + 1, + dtype_to_ctype(dtype_out), + self.block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name=name + "_stage1", + keep=keep, + options=options, + preamble=preamble, + ) self.stage1_func = s1_func.prepared_async_call # stage 2 has only one input and no map expression s2_func, self.stage2_arg_types = get_reduction_kernel_and_types( - 2, dtype_to_ctype(dtype_out), self.block_size, - 
neutral, reduce_expr, arguments=arguments, - name=name+"_stage2", keep=keep, options=options, - preamble=preamble) + 2, + dtype_to_ctype(dtype_out), + self.block_size, + neutral, + reduce_expr, + arguments=arguments, + name=name + "_stage2", + keep=keep, + options=options, + preamble=preamble, + ) self.stage2_func = s2_func.prepared_async_call - assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \ - "ReductionKernel can only be used with functions that have at least one " \ - "vector argument" + assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], ( + "ReductionKernel can only be used with functions that have at least one " + "vector argument" + ) def __call__(self, *args, **kwargs): MAX_BLOCK_COUNT = 1024 @@ -244,8 +292,9 @@ class ReductionKernel: for arg, arg_tp in zip(args, arg_types): if arg_tp == "P": if not arg.flags.forc: - raise RuntimeError("ReductionKernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError( + "ReductionKernel cannot " "deal with non-contiguous arrays" + ) vectors.append(arg) invocation_args.append(arg.gpudata) @@ -259,13 +308,13 @@ class ReductionKernel: if allocator is None: allocator = repr_vec.allocator - if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT: - total_block_size = SMALL_SEQ_COUNT*self.block_size + if sz <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT: + total_block_size = SMALL_SEQ_COUNT * self.block_size block_count = (sz + total_block_size - 1) // total_block_size seq_count = SMALL_SEQ_COUNT else: block_count = MAX_BLOCK_COUNT - macroblock_size = block_count*self.block_size + macroblock_size = block_count * self.block_size seq_count = (sz + macroblock_size - 1) // macroblock_size if block_count == 1 and out is not None: @@ -279,12 +328,16 @@ class ReductionKernel: else: result = empty((block_count,), self.dtype_out, allocator=allocator) - kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize) + kwargs = 
dict(shared_size=self.block_size * self.dtype_out.itemsize) # print block_count, seq_count, self.block_size, sz - f((block_count, 1), (self.block_size, 1, 1), stream, - *([result.gpudata]+invocation_args+[seq_count, sz]), - **kwargs) + f( + (block_count, 1), + (self.block_size, 1, 1), + stream, + *([result.gpudata] + invocation_args + [seq_count, sz]), + **kwargs + ) if block_count == 1: return result @@ -294,17 +347,17 @@ class ReductionKernel: args = (result,) + stage1_args - - @context_dependent_memoize def get_sum_kernel(dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(dtype_out, "0", "a+b", - arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) - - + return ReductionKernel( + dtype_out, + "0", + "a+b", + arguments="const {tp} *in".format(tp=dtype_to_ctype(dtype_in)), + ) @context_dependent_memoize @@ -312,27 +365,33 @@ def get_subset_sum_kernel(dtype_out, dtype_subset, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(dtype_out, "0", "a+b", - map_expr="in[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" - % { - "tp": dtype_to_ctype(dtype_in), - "tp_lut": dtype_to_ctype(dtype_subset), - }) - - + return ReductionKernel( + dtype_out, + "0", + "a+b", + map_expr="in[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" + % { + "tp": dtype_to_ctype(dtype_in), + "tp_lut": dtype_to_ctype(dtype_subset), + }, + ) @context_dependent_memoize def get_dot_kernel(dtype_out, dtype_a, dtype_b): - return ReductionKernel(dtype_out, neutral="0", - reduce_expr="a+b", map_expr="a[i]*b[i]", - arguments="const %(tp_a)s *a, const %(tp_b)s *b" % { - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }, keep=True) - - + return ReductionKernel( + dtype_out, + neutral="0", + reduce_expr="a+b", + map_expr="a[i]*b[i]", + arguments="const %(tp_a)s *a, const %(tp_b)s *b" + % { + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": 
dtype_to_ctype(dtype_b), + }, + keep=True, + ) @context_dependent_memoize @@ -350,16 +409,19 @@ def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): dtype_a = dtype_out # important: lookup_tbl must be first--it controls the length - return ReductionKernel(dtype_out, neutral="0", - reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, " - "const %(tp_a)s *a, const %(tp_b)s *b" % { + return ReductionKernel( + dtype_out, + neutral="0", + reduce_expr="a+b", + map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, " + "const %(tp_a)s *a, const %(tp_b)s *b" + % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), "tp_lut": dtype_to_ctype(dtype_subset), - }) - - + }, + ) def get_minmax_neutral(what, dtype): @@ -380,8 +442,6 @@ def get_minmax_neutral(what, dtype): raise ValueError("what is not min or max.") - - @context_dependent_memoize def get_minmax_kernel(what, dtype): if dtype == np.float64: @@ -393,14 +453,16 @@ def get_minmax_kernel(what, dtype): else: raise TypeError("unsupported dtype specified") - return ReductionKernel(dtype, - neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - arguments="const %(tp)s *in" % { - "tp": dtype_to_ctype(dtype), - }, preamble="#define MY_INFINITY (1./0)") - - + return ReductionKernel( + dtype, + neutral=get_minmax_neutral(what, dtype), + reduce_expr=f"{reduce_expr}", + arguments="const %(tp)s *in" + % { + "tp": dtype_to_ctype(dtype), + }, + preamble="#define MY_INFINITY (1./0)", + ) @context_dependent_memoize @@ -414,12 +476,16 @@ def get_subset_minmax_kernel(what, dtype, dtype_subset): else: raise TypeError("unsupported dtype specified") - return ReductionKernel(dtype, - neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - map_expr="in[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, " 
- "const %(tp)s *in" % { + return ReductionKernel( + dtype, + neutral=get_minmax_neutral(what, dtype), + reduce_expr=f"{reduce_expr}", + map_expr="in[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, " + "const %(tp)s *in" + % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), - }, preamble="#define MY_INFINITY (1./0)") + }, + preamble="#define MY_INFINITY (1./0)", + ) diff --git a/pycuda/scan.py b/pycuda/scan.py index e48217b72069054d026536ac61268d7ade53ae04..15e670ea16adbfb7939e5bafbef4f301b09aef15 100644 --- a/pycuda/scan.py +++ b/pycuda/scan.py @@ -1,16 +1,11 @@ """Scan primitive.""" -from __future__ import division -from __future__ import absolute_import -import six - __copyright__ = """ Copyright 2011 Andreas Kloeckner Copyright 2008-2011 NVIDIA Corporation """ - __license__ = """ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,8 +23,6 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/ """ - - import numpy as np import pycuda.driver as driver @@ -40,9 +33,9 @@ import pycuda._mymako as mako from pycuda._cluda import CLUDA_PREAMBLE - - -SHARED_PREAMBLE = CLUDA_PREAMBLE + """ +SHARED_PREAMBLE = ( + CLUDA_PREAMBLE + + """ #define WG_SIZE ${wg_size} #define SCAN_EXPR(a, b) ${scan_expr} @@ -50,11 +43,12 @@ ${preamble} typedef ${scan_type} scan_type; """ +) - - -SCAN_INTERVALS_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +SCAN_INTERVALS_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// #define K ${wg_seq_batches} <%def name="make_group_scan(name, with_bounds_check)"> @@ -237,12 +231,13 @@ void ${name_prefix}_scan_intervals( group_results[GID_0] = output[interval_end - 1]; } } -""") - - +""" +) -INCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +INCLUSIVE_UPDATE_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void 
${name_prefix}_final_update( @@ -275,12 +270,13 @@ void ${name_prefix}_final_update( } } } -""") - - +""" +) -EXCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +EXCLUSIVE_UPDATE_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( @@ -337,15 +333,21 @@ void ${name_prefix}_final_update( local_barrier(); } } -""") - - +""" +) -class _ScanKernelBase(object): - def __init__(self, dtype, - scan_expr, neutral=None, - name_prefix="scan", options=None, preamble="", devices=None): +class _ScanKernelBase: + def __init__( + self, + dtype, + scan_expr, + neutral=None, + name_prefix="scan", + options=None, + preamble="", + devices=None, + ): if isinstance(self, ExclusiveScanKernel) and neutral is None: raise ValueError("neutral element is required for exclusive scan") @@ -363,92 +365,111 @@ class _ScanKernelBase(object): name_prefix=name_prefix, scan_type=dtype_to_ctype(dtype), scan_expr=scan_expr, - neutral=neutral) - - scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render( - wg_size=self.scan_wg_size, - wg_seq_batches=self.scan_wg_seq_batches, - **kw_values)) + neutral=neutral, + ) + + scan_intervals_src = str( + SCAN_INTERVALS_SOURCE.render( + wg_size=self.scan_wg_size, + wg_seq_batches=self.scan_wg_seq_batches, + **kw_values + ) + ) scan_intervals_prg = SourceModule( - scan_intervals_src, options=options, no_extern_c=True) + scan_intervals_src, options=options, no_extern_c=True + ) self.scan_intervals_knl = scan_intervals_prg.get_function( - name_prefix+"_scan_intervals") + name_prefix + "_scan_intervals" + ) self.scan_intervals_knl.prepare("PIIPP") - final_update_src = str(self.final_update_tp.render( - wg_size=self.update_wg_size, - **kw_values)) + final_update_src = str( + self.final_update_tp.render(wg_size=self.update_wg_size, **kw_values) + ) final_update_prg = SourceModule( - final_update_src, options=options, no_extern_c=True) + final_update_src, 
options=options, no_extern_c=True + ) self.final_update_knl = final_update_prg.get_function( - name_prefix+"_final_update") + name_prefix + "_final_update" + ) self.final_update_knl.prepare("PIIP") - def __call__(self, input_ary, output_ary=None, allocator=None, - stream=None): + def __call__(self, input_ary, output_ary=None, allocator=None, stream=None): allocator = allocator or input_ary.allocator if output_ary is None: output_ary = input_ary - if isinstance(output_ary, (str, six.text_type)) and output_ary == "new": + if isinstance(output_ary, (str, str)) and output_ary == "new": output_ary = gpuarray.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: - raise RuntimeError("ScanKernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") - n, = input_ary.shape + (n,) = input_ary.shape if not n: return output_ary - unit_size = self.scan_wg_size * self.scan_wg_seq_batches + unit_size = self.scan_wg_size * self.scan_wg_seq_batches dev = driver.Context.get_device() - max_groups = 3*dev.get_attribute( - driver.device_attribute.MULTIPROCESSOR_COUNT) + max_groups = 3 * dev.get_attribute(driver.device_attribute.MULTIPROCESSOR_COUNT) from pytools import uniform_interval_splitting - interval_size, num_groups = uniform_interval_splitting( - n, unit_size, max_groups); - block_results = allocator(self.dtype.itemsize*num_groups) + interval_size, num_groups = uniform_interval_splitting(n, unit_size, max_groups) + + block_results = allocator(self.dtype.itemsize * num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl.prepared_async_call( - (num_groups, 1), (self.scan_wg_size, 1, 1), stream, - input_ary.gpudata, - n, interval_size, - output_ary.gpudata, - block_results) + (num_groups, 1), + (self.scan_wg_size, 
1, 1), + stream, + input_ary.gpudata, + n, + interval_size, + output_ary.gpudata, + block_results, + ) # second level inclusive scan of per-block results self.scan_intervals_knl.prepared_async_call( - (1,1), (self.scan_wg_size, 1, 1), stream, - block_results, - num_groups, interval_size, - block_results, - dummy_results) + (1, 1), + (self.scan_wg_size, 1, 1), + stream, + block_results, + num_groups, + interval_size, + block_results, + dummy_results, + ) # update intervals with result of second level scan self.final_update_knl.prepared_async_call( - (num_groups, 1,), (self.update_wg_size, 1, 1), stream, - output_ary.gpudata, - n, interval_size, - block_results) + ( + num_groups, + 1, + ), + (self.update_wg_size, 1, 1), + stream, + output_ary.gpudata, + n, + interval_size, + block_results, + ) return output_ary - - class InclusiveScanKernel(_ScanKernelBase): final_update_tp = INCLUSIVE_UPDATE_SOURCE + class ExclusiveScanKernel(_ScanKernelBase): final_update_tp = EXCLUSIVE_UPDATE_SOURCE diff --git a/pycuda/sparse/__init__.py b/pycuda/sparse/__init__.py index 3bb8a67f96d60703b46119cfb367378842bec89f..0949981b69451fbb55ca1c8ed78af7f9a37457f6 100644 --- a/pycuda/sparse/__init__.py +++ b/pycuda/sparse/__init__.py @@ -1,4 +1,8 @@ from __future__ import absolute_import from warnings import warn -warn("pycuda.sparse is deprecated. and will be removed in 2015.x", - DeprecationWarning, stacklevel=2) + +warn( + "pycuda.sparse is deprecated. 
and will be removed in 2015.x", + DeprecationWarning, + stacklevel=2, +) diff --git a/pycuda/sparse/cg.py b/pycuda/sparse/cg.py index c583af01e01eb5abd62d4647d8e4e181e90a47df..42148ecec971e0743b51c8309c85099cf9b86138 100644 --- a/pycuda/sparse/cg.py +++ b/pycuda/sparse/cg.py @@ -2,22 +2,21 @@ from __future__ import division from __future__ import absolute_import from pycuda.sparse.inner import AsyncInnerProduct from pytools import memoize_method +import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np - - class ConvergenceError(RuntimeError): pass - class CGStateContainer: def __init__(self, operator, precon=None, pagelocked_allocator=None): if precon is None: from pycuda.sparse.operator import IdentityOperator + precon = IdentityOperator(operator.dtype, operator.shape[0]) self.operator = operator @@ -28,23 +27,21 @@ class CGStateContainer: @memoize_method def make_lc2_kernel(self, dtype, a_is_gpu, b_is_gpu): from pycuda.elementwise import get_linear_combination_kernel - return get_linear_combination_kernel(( - (a_is_gpu, dtype, dtype), - (b_is_gpu, dtype, dtype) - ), dtype) + + return get_linear_combination_kernel( + ((a_is_gpu, dtype, dtype), (b_is_gpu, dtype, dtype)), dtype + ) def lc2(self, a, x, b, y, out=None): if out is None: - out = gpuarray.empty(x.shape, dtype=x.dtype, - allocator=x.allocator) + out = gpuarray.empty(x.shape, dtype=x.dtype, allocator=x.allocator) assert x.dtype == y.dtype == out.dtype a_is_gpu = isinstance(a, gpuarray.GPUArray) b_is_gpu = isinstance(b, gpuarray.GPUArray) assert x.shape == y.shape == out.shape - kernel, texrefs = self.make_lc2_kernel( - x.dtype, a_is_gpu, b_is_gpu) + kernel, texrefs = self.make_lc2_kernel(x.dtype, a_is_gpu, b_is_gpu) texrefs = texrefs[:] @@ -76,25 +73,29 @@ class CGStateContainer: def guarded_div_kernel(self, dtype_x, dtype_y, dtype_z): from pycuda.elementwise import get_elwise_kernel from pycuda.tools import dtype_to_ctype + return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, 
%(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = y[i] == 0 ? 0 : (x[i] / y[i])", - "divide") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = y[i] == 0 ? 0 : (x[i] / y[i])", + "divide", + ) def guarded_div(self, a, b): from pycuda.gpuarray import _get_common_dtype + result = a._new_like_me(_get_common_dtype(a, b)) assert a.shape == b.shape func = self.guarded_div_kernel(a.dtype, b.dtype, result.dtype) - func.prepared_async_call(a._grid, a._block, None, - a.gpudata, b.gpudata, - result.gpudata, a.mem_size) + func.prepared_async_call( + a._grid, a._block, None, a.gpudata, b.gpudata, result.gpudata, a.mem_size + ) return result @@ -110,8 +111,7 @@ class CGStateContainer: self.d = self.precon(self.residual) # grows at the end - delta = AsyncInnerProduct(self.residual, self.d, - self.pagelocked_allocator) + delta = AsyncInnerProduct(self.residual, self.d, self.pagelocked_allocator) self.real_delta_queue = [delta] self.delta = delta.gpu_result @@ -128,15 +128,13 @@ class CGStateContainer: self.lc2(1, self.x, alpha, self.d, out=self.x) if compute_real_residual: - self.residual = self.lc2( - 1, self.rhs, -1, self.operator(self.x)) + self.residual = self.lc2(1, self.rhs, -1, self.operator(self.x)) else: self.lc2(1, self.residual, -alpha, q, out=self.residual) s = self.precon(self.residual) delta_old = self.delta - delta = AsyncInnerProduct(self.residual, s, - self.pagelocked_allocator) + delta = AsyncInnerProduct(self.residual, s, self.pagelocked_allocator) self.delta = delta.gpu_result beta = self.guarded_div(self.delta, delta_old) @@ -149,18 +147,15 @@ class CGStateContainer: check_interval = 20 if max_iterations is None: - max_iterations = max( - 3*check_interval+1, 10 * self.operator.shape[0]) + max_iterations = max(3 * check_interval + 1, 10 * 
self.operator.shape[0]) real_resid_interval = min(self.operator.shape[0], 50) iterations = 0 delta_0 = None while iterations < max_iterations: - compute_real_residual = \ - iterations % real_resid_interval == 0 + compute_real_residual = iterations % real_resid_interval == 0 - self.one_iteration( - compute_real_residual=compute_real_residual) + self.one_iteration(compute_real_residual=compute_real_residual) if debug_callback is not None: if compute_real_residual: @@ -168,8 +163,9 @@ class CGStateContainer: else: what = "it" - debug_callback(what, iterations, self.x, - self.residual, self.d, self.delta) + debug_callback( + what, iterations, self.x, self.residual, self.d, self.delta + ) # do often enough to allow AsyncInnerProduct # to progress through (polled) event chain @@ -185,11 +181,16 @@ class CGStateContainer: while i < len(rdq): delta = rdq[i].get_host_result() if delta is not None: - if abs(delta) < tol*tol * abs(delta_0): + if abs(delta) < tol * tol * abs(delta_0): if debug_callback is not None: - debug_callback("end", iterations, - self.x, self.residual, - self.d, self.delta) + debug_callback( + "end", + iterations, + self.x, + self.residual, + self.d, + self.delta, + ) return self.x rdq.pop(i) else: @@ -200,26 +201,33 @@ class CGStateContainer: raise ConvergenceError("cg failed to converge") - - -def solve_pkt_with_cg(pkt_spmv, b, precon=None, x=None, tol=1e-7, max_iterations=None, - debug=False, pagelocked_allocator=None): +def solve_pkt_with_cg( + pkt_spmv, + b, + precon=None, + x=None, + tol=1e-7, + max_iterations=None, + debug=False, + pagelocked_allocator=None, +): if x is None: - x = gpuarray.zeros(pkt_spmv.shape[0], dtype=pkt_spmv.dtype, - allocator=b.allocator) + x = gpuarray.zeros( + pkt_spmv.shape[0], dtype=pkt_spmv.dtype, allocator=b.allocator + ) else: x = pkt_spmv.permute(x) if pagelocked_allocator is None: pagelocked_allocator = drv.pagelocked_empty - cg = CGStateContainer(pkt_spmv, precon, - pagelocked_allocator=pagelocked_allocator) + cg = 
CGStateContainer(pkt_spmv, precon, pagelocked_allocator=pagelocked_allocator) cg.reset(pkt_spmv.permute(b), x) it_count = [0] res_count = [0] + def debug_callback(what, it_number, x, resid, d, delta): if what == "it": it_count[0] += 1 @@ -227,11 +235,6 @@ def solve_pkt_with_cg(pkt_spmv, b, precon=None, x=None, tol=1e-7, max_iterations res_count[0] += 1 it_count[0] += 1 - result = cg.run(max_iterations, tol, - debug_callback=debug_callback) + result = cg.run(max_iterations, tol, debug_callback=debug_callback) return pkt_spmv.unpermute(result), it_count[0], res_count[0] - - - - diff --git a/pycuda/sparse/coordinate.py b/pycuda/sparse/coordinate.py index bf068dd89af3012eeb98aaedbdca507433e9ab80..8299387f97f0c5f9e0a43ba8315b1e6705678387 100644 --- a/pycuda/sparse/coordinate.py +++ b/pycuda/sparse/coordinate.py @@ -7,8 +7,6 @@ from pycuda.compiler import SourceModule import numpy as np - - COO_FLAT_KERNEL_TEMPLATE = """ #include @@ -122,7 +120,6 @@ spmv_coo_flat_kernel(const index_type num_nonzeros, """ - COO_SERIAL_KERNEL_TEMPLATE = """ typedef %(value_type)s value_type; typedef %(index_type)s index_type; @@ -141,8 +138,6 @@ spmv_coo_serial_kernel(const index_type num_nonzeros, """ - - class CoordinateSpMV: def __init__(self, mat, dtype): self.dtype = np.dtype(dtype) @@ -152,6 +147,7 @@ class CoordinateSpMV: self.block_size = 128 from scipy.sparse import coo_matrix + coo_mat = coo_matrix(mat, dtype=self.dtype) self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype)) @@ -160,43 +156,49 @@ class CoordinateSpMV: self.nnz = coo_mat.nnz from pycuda.tools import DeviceData + dev = drv.Context.get_device() devdata = DeviceData() - max_threads = (devdata.warps_per_mp*devdata.warp_size* - dev.multiprocessor_count) - max_blocks = 4*max_threads // self.block_size + max_threads = ( + devdata.warps_per_mp * devdata.warp_size * dev.multiprocessor_count + ) + max_blocks = 4 * max_threads // self.block_size warps_per_block = self.block_size // dev.warp_size if self.nnz: + 
def divide_into(x, y): - return (x+y-1)//y + return (x + y - 1) // y - num_units = self.nnz // dev.warp_size - num_warps = min(num_units, warps_per_block * max_blocks) + num_units = self.nnz // dev.warp_size + num_warps = min(num_units, warps_per_block * max_blocks) self.num_blocks = divide_into(num_warps, warps_per_block) - num_iters = divide_into(num_units, num_warps) + num_iters = divide_into(num_units, num_warps) self.interval_size = dev.warp_size * num_iters self.tail = num_units * dev.warp_size - @memoize_method def get_flat_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( - COO_FLAT_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "tex_value_type": dtype_to_ctype( - self.dtype, with_fp_tex_hack=True), - "index_type": dtype_to_ctype(self.index_dtype), - "block_size": self.block_size, - "warp_size": drv.Context.get_device().warp_size, - }) + COO_FLAT_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "tex_value_type": dtype_to_ctype(self.dtype, with_fp_tex_hack=True), + "index_type": dtype_to_ctype(self.index_dtype), + "block_size": self.block_size, + "warp_size": drv.Context.get_device().warp_size, + } + ) func = mod.get_function("spmv_coo_flat_kernel") x_texref = mod.get_texref("tex_x") - func.prepare(self.index_dtype.char*2 + "PPPP", - (self.block_size, 1, 1), texrefs=[x_texref]) + func.prepare( + self.index_dtype.char * 2 + "PPPP", + (self.block_size, 1, 1), + texrefs=[x_texref], + ) return func, x_texref @memoize_method @@ -204,37 +206,43 @@ class CoordinateSpMV: from pycuda.tools import dtype_to_ctype mod = SourceModule( - COO_SERIAL_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "index_type": dtype_to_ctype(self.index_dtype), - }) + COO_SERIAL_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "index_type": dtype_to_ctype(self.index_dtype), + } + ) func = mod.get_function("spmv_coo_serial_kernel") func.prepare(self.index_dtype.char + "PPPPP", (1, 1, 1)) return 
func def __call__(self, x, y=None): if y is None: - y = gpuarray.zeros(self.shape[0], dtype=self.dtype, - allocator=x.allocator) + y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) if self.nnz == 0: return y flat_func, x_texref = self.get_flat_kernel() x.bind_to_texref_ext(x_texref, allow_double_hack=True) - flat_func.prepared_call((self.num_blocks, 1), - self.tail, self.interval_size, - self.row_gpu.gpudata, - self.col_gpu.gpudata, - self.data_gpu.gpudata, - y.gpudata) + flat_func.prepared_call( + (self.num_blocks, 1), + self.tail, + self.interval_size, + self.row_gpu.gpudata, + self.col_gpu.gpudata, + self.data_gpu.gpudata, + y.gpudata, + ) self.get_serial_kernel().prepared_call( - (1, 1), - self.nnz - self.tail, - self.row_gpu[self.tail:].gpudata, - self.col_gpu[self.tail:].gpudata, - self.data_gpu[self.tail:].gpudata, - x.gpudata, y.gpudata) + (1, 1), + self.nnz - self.tail, + self.row_gpu[self.tail:].gpudata, + self.col_gpu[self.tail:].gpudata, + self.data_gpu[self.tail:].gpudata, + x.gpudata, + y.gpudata, + ) return y diff --git a/pycuda/sparse/inner.py b/pycuda/sparse/inner.py index c68405d6eb893ec9f66de720248c2141d0a6963d..7e55e3078130ecfca3c1592d2a621797eb18470b 100644 --- a/pycuda/sparse/inner.py +++ b/pycuda/sparse/inner.py @@ -3,14 +3,12 @@ from __future__ import absolute_import import pycuda.driver as drv import pycuda.gpuarray as gpuarray - +import atexit STREAM_POOL = [] - - def get_stream(): if STREAM_POOL: return STREAM_POOL.pop() @@ -18,9 +16,6 @@ def get_stream(): return drv.Stream() - - - class AsyncInnerProduct: def __init__(self, a, b, pagelocked_allocator): self.gpu_result = gpuarray.dot(a, b) @@ -36,11 +31,11 @@ class AsyncInnerProduct: self.gpu_finished = True self.copy_stream = get_stream() self.host_dest = self.pagelocked_allocator( - self.gpu_result.shape, self.gpu_result.dtype, - self.copy_stream) - drv.memcpy_dtoh_async(self.host_dest, - self.gpu_result.gpudata, - self.copy_stream) + self.gpu_result.shape, 
self.gpu_result.dtype, self.copy_stream + ) + drv.memcpy_dtoh_async( + self.host_dest, self.gpu_result.gpudata, self.copy_stream + ) self.copy_finished_evt = drv.Event() self.copy_finished_evt.record() else: @@ -49,11 +44,8 @@ class AsyncInnerProduct: return self.host_dest - - def _at_exit(): STREAM_POOL[:] = [] -import atexit -atexit.register(_at_exit) +atexit.register(_at_exit) diff --git a/pycuda/sparse/operator.py b/pycuda/sparse/operator.py index 529f7875f02e7ef63b96feebe511df1a5f6fc9c8..bf8231c29fabffbdc1a70035b27735d9ab7d0240 100644 --- a/pycuda/sparse/operator.py +++ b/pycuda/sparse/operator.py @@ -7,11 +7,6 @@ class OperatorBase(object): def shape(self): raise NotImplementedError - def __neg__(self): - return NegOperator(self) - - - class IdentityOperator(OperatorBase): def __init__(self, dtype, n): @@ -30,8 +25,6 @@ class IdentityOperator(OperatorBase): return operand - - class DiagonalPreconditioner(OperatorBase): def __init__(self, diagonal): self.diagonal = diagonal @@ -46,8 +39,4 @@ class DiagonalPreconditioner(OperatorBase): return n, n def __call__(self, operand): - return self.diagonal*operand - - - - + return self.diagonal * operand diff --git a/pycuda/sparse/packeted.py b/pycuda/sparse/packeted.py index 973cb42c052b75f2a743417e1be98ecec2d376da..f2819db81bfc0be180c71beb68e802b4040b7c67 100644 --- a/pycuda/sparse/packeted.py +++ b/pycuda/sparse/packeted.py @@ -2,13 +2,9 @@ from __future__ import division from __future__ import absolute_import from __future__ import print_function from pytools import memoize_method -import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np -from six.moves import range - - PKT_KERNEL_TEMPLATE = """ @@ -87,11 +83,10 @@ spmv_pkt_kernel(const index_type *row_ptr, """ - - class PacketedSpMV: def __init__(self, mat, is_symmetric, dtype): from pycuda.tools import DeviceData + devdata = DeviceData() # all row indices in the data structure generation code are 
@@ -105,17 +100,19 @@ class PacketedSpMV: if h != w: raise ValueError("only square matrices are supported") - self.rows_per_packet = (devdata.shared_memory - 100) \ - // (2*self.dtype.itemsize) + self.rows_per_packet = (devdata.shared_memory - 100) // ( + 2 * self.dtype.itemsize + ) - self.block_count = \ - (h + self.rows_per_packet - 1) // self.rows_per_packet + self.block_count = (h + self.rows_per_packet - 1) // self.rows_per_packet # get metis partition ------------------------------------------------- from scipy.sparse import csr_matrix + csr_mat = csr_matrix(mat, dtype=self.dtype) from pymetis import part_graph + if not is_symmetric: # make sure adjacency graph is undirected adj_mat = csr_mat + csr_mat.T @@ -123,8 +120,9 @@ class PacketedSpMV: adj_mat = csr_mat while True: - cut_count, dof_to_packet_nr = part_graph(int(self.block_count), - xadj=adj_mat.indptr, adjncy=adj_mat.indices) + cut_count, dof_to_packet_nr = part_graph( + int(self.block_count), xadj=adj_mat.indptr, adjncy=adj_mat.indices + ) # build packet_nr_to_dofs packet_nr_to_dofs = {} @@ -136,8 +134,9 @@ class PacketedSpMV: dof_packet.append(i) - packet_nr_to_dofs = [packet_nr_to_dofs.get(i) - for i in range(len(packet_nr_to_dofs))] + packet_nr_to_dofs = [ + packet_nr_to_dofs.get(i) for i in range(len(packet_nr_to_dofs)) + ] too_big = False for packet_dofs in packet_nr_to_dofs: @@ -147,10 +146,13 @@ class PacketedSpMV: if too_big: old_block_count = self.block_count - self.block_count = int(2+1.05*self.block_count) - print(("Metis produced a big block at block count " - "%d--retrying with %d" - % (old_block_count, self.block_count))) + self.block_count = int(2 + 1.05 * self.block_count) + print( + ( + "Metis produced a big block at block count " + "%d--retrying with %d" % (old_block_count, self.block_count) + ) + ) continue break @@ -158,51 +160,55 @@ class PacketedSpMV: assert len(packet_nr_to_dofs) == self.block_count # permutations, base rows --------------------------------------------- - 
new2old_fetch_indices, \ - old2new_fetch_indices, \ - packet_base_rows = self.find_simple_index_stuff( - packet_nr_to_dofs) + ( + new2old_fetch_indices, + old2new_fetch_indices, + packet_base_rows, + ) = self.find_simple_index_stuff(packet_nr_to_dofs) # find local row cost and remaining_coo ------------------------------- - local_row_costs, remaining_coo = \ - self.find_local_row_costs_and_remaining_coo( - csr_mat, dof_to_packet_nr, old2new_fetch_indices) + local_row_costs, remaining_coo = self.find_local_row_costs_and_remaining_coo( + csr_mat, dof_to_packet_nr, old2new_fetch_indices + ) local_nnz = np.sum(local_row_costs) assert remaining_coo.nnz == csr_mat.nnz - local_nnz # find thread assignment for each block ------------------------------- - thread_count = len(packet_nr_to_dofs)*self.threads_per_packet + thread_count = len(packet_nr_to_dofs) * self.threads_per_packet thread_assignments, thread_costs = self.find_thread_assignment( - packet_nr_to_dofs, local_row_costs, thread_count) + packet_nr_to_dofs, local_row_costs, thread_count + ) max_thread_costs = np.max(thread_costs) # build data structure ------------------------------------------------ from .pkt_build import build_pkt_data_structure - build_pkt_data_structure(self, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, thread_assignments, - local_row_costs) + + build_pkt_data_structure( + self, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, + ) self.packet_base_rows = gpuarray.to_gpu(packet_base_rows) - self.new2old_fetch_indices = gpuarray.to_gpu( - new2old_fetch_indices) - self.old2new_fetch_indices = gpuarray.to_gpu( - old2new_fetch_indices) + self.new2old_fetch_indices = gpuarray.to_gpu(new2old_fetch_indices) + self.old2new_fetch_indices = gpuarray.to_gpu(old2new_fetch_indices) from .coordinate import CoordinateSpMV - self.remaining_coo_gpu = CoordinateSpMV( - remaining_coo, dtype) 
+ + self.remaining_coo_gpu = CoordinateSpMV(remaining_coo, dtype) def find_simple_index_stuff(self, packet_nr_to_dofs): - new2old_fetch_indices = np.zeros( - self.shape[0], dtype=self.index_dtype) - old2new_fetch_indices = np.zeros( - self.shape[0], dtype=self.index_dtype) + new2old_fetch_indices = np.zeros(self.shape[0], dtype=self.index_dtype) + old2new_fetch_indices = np.zeros(self.shape[0], dtype=self.index_dtype) - packet_base_rows = np.zeros( - self.block_count+1, - dtype=self.index_dtype) + packet_base_rows = np.zeros(self.block_count + 1, dtype=self.index_dtype) row_start = 0 for packet_nr, packet in enumerate(packet_nr_to_dofs): @@ -210,22 +216,22 @@ class PacketedSpMV: row_end = row_start + len(packet) pkt_indices = np.array(packet, dtype=self.index_dtype) - new2old_fetch_indices[row_start:row_end] = \ - pkt_indices - old2new_fetch_indices[pkt_indices] = \ - np.arange(row_start, row_end, dtype=self.index_dtype) + new2old_fetch_indices[row_start:row_end] = pkt_indices + old2new_fetch_indices[pkt_indices] = np.arange( + row_start, row_end, dtype=self.index_dtype + ) row_start += len(packet) packet_base_rows[self.block_count] = row_start - return (new2old_fetch_indices, old2new_fetch_indices, - packet_base_rows) + return (new2old_fetch_indices, old2new_fetch_indices, packet_base_rows) - def find_local_row_costs_and_remaining_coo(self, csr_mat, dof_to_packet_nr, - old2new_fetch_indices): + def find_local_row_costs_and_remaining_coo( + self, csr_mat, dof_to_packet_nr, old2new_fetch_indices + ): h, w = self.shape - local_row_costs = [0]*h + local_row_costs = [0] * h rem_coo_values = [] rem_coo_i = [] rem_coo_j = [] @@ -235,7 +241,7 @@ class PacketedSpMV: data = csr_mat.data for i in range(h): - for idx in range(iptr[i], iptr[i+1]): + for idx in range(iptr[i], iptr[i + 1]): j = indices[idx] if dof_to_packet_nr[i] == dof_to_packet_nr[j]: @@ -246,29 +252,29 @@ class PacketedSpMV: rem_coo_j.append(old2new_fetch_indices[j]) from scipy.sparse import coo_matrix + 
remaining_coo = coo_matrix( - (rem_coo_values, (rem_coo_i, rem_coo_j)), self.shape, - dtype=self.dtype) + (rem_coo_values, (rem_coo_i, rem_coo_j)), self.shape, dtype=self.dtype + ) return local_row_costs, remaining_coo - def find_thread_assignment(self, packet_nr_to_dofs, local_row_cost, - thread_count): + def find_thread_assignment(self, packet_nr_to_dofs, local_row_cost, thread_count): thread_assignments = [[] for i in range(thread_count)] thread_costs = np.zeros(thread_count) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): row_costs_and_numbers = sorted( - [(local_row_cost[i], i) for i in packet_dofs], - reverse=True) + [(local_row_cost[i], i) for i in packet_dofs], reverse=True + ) - base_thread_nr = packet_nr*self.threads_per_packet + base_thread_nr = packet_nr * self.threads_per_packet thread_offset = 0 # zigzag assignment step = 1 for row_cost, row_number in row_costs_and_numbers: - ti = base_thread_nr+thread_offset + ti = base_thread_nr + thread_offset thread_assignments[ti].append(row_number) thread_costs[ti] += row_cost @@ -281,16 +287,29 @@ class PacketedSpMV: return thread_assignments, thread_costs - def build_gpu_data_structure(self, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, thread_assignments, - local_row_costs): + def build_gpu_data_structure( + self, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, + ): # these arrays will likely be too long, but that's ok from .pkt_build import build_pkt_structure - build_pkt_structure(self, packet_nr_to_dofs, thread_assignments, - thread_starts, thread_ends, index_array, data_array) - + build_pkt_structure( + self, + packet_nr_to_dofs, + thread_assignments, + # thread_starts, + # thread_ends, + # index_array, + # data_array, + ) # copy data to the gpu ------------------------------------------------ @@ -300,13 +319,16 @@ class PacketedSpMV: from pycuda.tools import 
dtype_to_ctype mod = SourceModule( - PKT_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "index_type": dtype_to_ctype(self.index_dtype), - "packed_index_type": dtype_to_ctype(self.packed_index_dtype), - "threads_per_packet": self.threads_per_packet, - "rows_per_packet": self.rows_per_packet, - }, no_extern_c=True) + PKT_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "index_type": dtype_to_ctype(self.index_dtype), + "packed_index_type": dtype_to_ctype(self.packed_index_dtype), + "threads_per_packet": self.threads_per_packet, + "rows_per_packet": self.rows_per_packet, + }, + no_extern_c=True, + ) func = mod.get_function("spmv_pkt_kernel") func.prepare("PPPPPPP") return func @@ -319,21 +341,20 @@ class PacketedSpMV: def __call__(self, x, y=None): if y is None: - y = gpuarray.zeros(self.shape[0], dtype=self.dtype, - allocator=x.allocator) + y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) self.get_kernel().prepared_call( - (self.block_count, 1), - (self.threads_per_packet, 1, 1), - self.packet_base_rows.gpudata, - self.thread_starts.gpudata, - self.thread_ends.gpudata, - self.index_array.gpudata, - self.data_array.gpudata, - x.gpudata, - y.gpudata) + (self.block_count, 1), + (self.threads_per_packet, 1, 1), + self.packet_base_rows.gpudata, + self.thread_starts.gpudata, + self.thread_ends.gpudata, + self.index_array.gpudata, + self.data_array.gpudata, + x.gpudata, + y.gpudata, + ) self.remaining_coo_gpu(x, y) return y - diff --git a/pycuda/sparse/pkt_build.py b/pycuda/sparse/pkt_build.py index 278ca8edcd222c735f7ae9bda6a7e4204a7bb2c7..b3591584bd6b54f492a53e3da9fdc1ebca8b0e2e 100644 --- a/pycuda/sparse/pkt_build.py +++ b/pycuda/sparse/pkt_build.py @@ -1,44 +1,46 @@ from __future__ import absolute_import import numpy as np import pycuda.gpuarray as gpuarray -from six.moves import range - - -def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, 
thread_assignments, - local_row_costs): +def build_pkt_data_structure( + spmv, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, +): packet_start = 0 base_dof_nr = 0 max_thread_costs = int(max_thread_costs) index_array = np.zeros( - max_thread_costs*thread_count, dtype=spmv.packed_index_dtype) - data_array = np.zeros( - max_thread_costs*thread_count, dtype=spmv.dtype) - thread_starts = np.zeros( - thread_count, dtype=spmv.index_dtype) - thread_ends = np.zeros( - thread_count, dtype=spmv.index_dtype) + max_thread_costs * thread_count, dtype=spmv.packed_index_dtype + ) + data_array = np.zeros(max_thread_costs * thread_count, dtype=spmv.dtype) + thread_starts = np.zeros(thread_count, dtype=spmv.index_dtype) + thread_ends = np.zeros(thread_count, dtype=spmv.index_dtype) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): - base_thread_nr = packet_nr*spmv.threads_per_packet + base_thread_nr = packet_nr * spmv.threads_per_packet max_packet_items = 0 for thread_offset in range(spmv.threads_per_packet): - thread_write_idx = packet_start+thread_offset - thread_start = packet_start+thread_offset - thread_starts[base_thread_nr+thread_offset] = thread_write_idx + thread_write_idx = packet_start + thread_offset + thread_start = packet_start + thread_offset + thread_starts[base_thread_nr + thread_offset] = thread_write_idx - for row_nr in thread_assignments[base_thread_nr+thread_offset]: + for row_nr in thread_assignments[base_thread_nr + thread_offset]: perm_row_nr = old2new_fetch_indices[row_nr] rel_row_nr = perm_row_nr - base_dof_nr assert 0 <= rel_row_nr < len(packet_dofs) row_entries = 0 - for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr+1]): + for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr + 1]): col_nr = csr_mat.indices[idx] perm_col_nr = old2new_fetch_indices[col_nr] @@ -52,14 +54,13 @@ def build_pkt_data_structure(spmv, packet_nr_to_dofs, 
max_thread_costs, assert row_entries == local_row_costs[row_nr] - thread_ends[base_thread_nr+thread_offset] = thread_write_idx + thread_ends[base_thread_nr + thread_offset] = thread_write_idx - thread_items = (thread_write_idx - thread_start)//spmv.threads_per_packet - max_packet_items = max( - max_packet_items, thread_items) + thread_items = (thread_write_idx - thread_start) // spmv.threads_per_packet + max_packet_items = max(max_packet_items, thread_items) base_dof_nr += len(packet_dofs) - packet_start += max_packet_items*spmv.threads_per_packet + packet_start += max_packet_items * spmv.threads_per_packet spmv.thread_starts = gpuarray.to_gpu(thread_starts) spmv.thread_ends = gpuarray.to_gpu(thread_ends) @@ -67,12 +68,10 @@ def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, spmv.data_array = gpuarray.to_gpu(data_array) - - try: import pyximport except ImportError: pass else: pyximport.install() - from pycuda.sparse.pkt_build_cython import build_pkt_data_structure + from pycuda.sparse.pkt_build_cython import build_pkt_data_structure # noqa: F811, F401 diff --git a/pycuda/tools.py b/pycuda/tools.py index c243b911bf9a4331d358364c5bfac84b647aebce..1a2b50f4dd786628a5721b98f01accab013bd875 100644 --- a/pycuda/tools.py +++ b/pycuda/tools.py @@ -1,11 +1,5 @@ """Miscallenous helper functionality.""" -from __future__ import division, print_function -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import input - __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" __license__ = """ @@ -37,15 +31,18 @@ import pycuda._driver as _drv import numpy as np +from pycuda.compyte.dtypes import ( # noqa: F401 + register_dtype, + get_or_register_dtype, + _fill_dtype_registry, + dtype_to_ctype as base_dtype_to_ctype, +) + bitlog2 = _drv.bitlog2 DeviceMemoryPool = _drv.DeviceMemoryPool PageLockedMemoryPool = _drv.PageLockedMemoryPool PageLockedAllocator = _drv.PageLockedAllocator -from pycuda.compyte.dtypes import ( - 
register_dtype, get_or_register_dtype, _fill_dtype_registry, - dtype_to_ctype as base_dtype_to_ctype) - _fill_dtype_registry(respect_windows=True) get_or_register_dtype("pycuda::complex", np.complex64) get_or_register_dtype("pycuda::complex", np.complex128) @@ -53,6 +50,7 @@ get_or_register_dtype("pycuda::complex", np.complex128) # {{{ debug memory pool + class DebugMemoryPool(DeviceMemoryPool): def __init__(self, interactive=True, logfile=None): DeviceMemoryPool.__init__(self) @@ -61,42 +59,52 @@ class DebugMemoryPool(DeviceMemoryPool): if logfile is None: import sys + logfile = sys.stdout self.logfile = logfile from weakref import WeakKeyDictionary + self.blocks = WeakKeyDictionary() if interactive: from pytools.diskdict import DiskDict + self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics") def allocate(self, size): from traceback import extract_stack + stack = tuple(frm[2] for frm in extract_stack()) description = self.describe(stack, size) histogram = {} - for bsize, descr in six.itervalues(self.blocks): + for bsize, descr in self.blocks.values(): histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1 from pytools import common_prefix + cpfx = common_prefix(descr for bsize, descr in histogram) print( - "\n Allocation of size %d occurring " - "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" - "\n at: %s" % ( - (size, self.last_free) + cuda.mem_get_info() - + (self.held_blocks, self.active_blocks, - description)), - file=self.logfile) - - hist_items = sorted(list(six.iteritems(histogram))) + "\n Allocation of size %d occurring " + "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" + "\n at: %s" + % ( + (size, self.last_free) + + cuda.mem_get_info() + + (self.held_blocks, self.active_blocks, description) + ), + file=self.logfile, + ) + + hist_items = sorted(list(histogram.items())) for (bsize, descr), count in hist_items: - print(" %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count), - 
file=self.logfile) + print( + " %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count), + file=self.logfile, + ) if self.interactive: input(" [Enter]") @@ -116,33 +124,43 @@ class DebugMemoryPool(DeviceMemoryPool): print(size, stack) while True: mnemonic = input("Enter mnemonic or [Enter] for more info:") - if mnemonic == '': + if mnemonic == "": from traceback import print_stack + print_stack() else: break self.stacktrace_mnemonics[stack, size] = mnemonic return mnemonic + # }}} # {{{ default device/context + def get_default_device(default=0): from warnings import warn - warn("get_default_device() is deprecated; " - "use make_default_context() instead", DeprecationWarning) + + warn( + "get_default_device() is deprecated; " "use make_default_context() instead", + DeprecationWarning, + ) from pycuda.driver import Device import os + dev = os.environ.get("CUDA_DEVICE") if dev is None: try: - dev = (open(os.path.join(os.path.expanduser("~"), ".cuda_device")) - .read().strip()) - except: + dev = ( + open(os.path.join(os.path.expanduser("~"), ".cuda_device")) + .read() + .strip() + ) + except Exception: pass if dev is None: @@ -151,24 +169,28 @@ def get_default_device(default=0): try: dev = int(dev) except TypeError: - raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda-device) " - "must be an integer") + raise TypeError( + "CUDA device number (CUDA_DEVICE or ~/.cuda-device) " "must be an integer" + ) return Device(dev) def make_default_context(ctx_maker=None): if ctx_maker is None: + def ctx_maker(dev): return dev.make_context() ndevices = cuda.Device.count() if ndevices == 0: - raise RuntimeError("No CUDA enabled device found. " - "Please check your installation.") + raise RuntimeError( + "No CUDA enabled device found. " "Please check your installation." + ) # Is CUDA_DEVICE set? import os + devn = os.environ.get("CUDA_DEVICE") # Is $HOME/.cuda_device set ? 
@@ -176,9 +198,8 @@ def make_default_context(ctx_maker=None): try: homedir = os.environ.get("HOME") assert homedir is not None - devn = (open(os.path.join(homedir, ".cuda_device")) - .read().strip()) - except: + devn = open(os.path.join(homedir, ".cuda_device")).read().strip() + except Exception: pass # If either CUDA_DEVICE or $HOME/.cuda_device is set, try to use it @@ -186,8 +207,10 @@ def make_default_context(ctx_maker=None): try: devn = int(devn) except TypeError: - raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda_device)" - " must be an integer") + raise TypeError( + "CUDA device number (CUDA_DEVICE or ~/.cuda_device)" + " must be an integer" + ) dev = cuda.Device(devn) return ctx_maker(dev) @@ -201,14 +224,18 @@ def make_default_context(ctx_maker=None): except cuda.Error: pass - raise RuntimeError("make_default_context() wasn't able to create a context " - "on any of the %d detected devices" % ndevices) + raise RuntimeError( + "make_default_context() wasn't able to create a context " + "on any of the %d detected devices" % ndevices + ) + # }}} # {{{ rounding helpers + def _exact_div(dividend, divisor): quot, rem = divmod(dividend, divisor) assert rem == 0 @@ -220,7 +247,8 @@ def _int_ceiling(value, multiple_of=1): # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import ceil - return int(ceil(value/multiple_of))*multiple_of + + return int(ceil(value / multiple_of)) * multiple_of def _int_floor(value, multiple_of=1): @@ -228,13 +256,16 @@ def _int_floor(value, multiple_of=1): # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import floor - return int(floor(value/multiple_of))*multiple_of + + return int(floor(value / multiple_of)) * multiple_of + # }}} # {{{ device data + class DeviceData: def __init__(self, dev=None): import pycuda.driver as drv @@ -242,8 +273,7 @@ class DeviceData: if dev is None: dev = cuda.Context.get_device() - self.max_threads = 
dev.get_attribute( - drv.device_attribute.MAX_THREADS_PER_BLOCK) + self.max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK) self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE) if dev.compute_capability() >= (3, 0): @@ -256,10 +286,10 @@ class DeviceData: self.warps_per_mp = 24 self.thread_blocks_per_mp = 8 - self.registers = dev.get_attribute( - drv.device_attribute.MAX_REGISTERS_PER_BLOCK) + self.registers = dev.get_attribute(drv.device_attribute.MAX_REGISTERS_PER_BLOCK) self.shared_memory = dev.get_attribute( - drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK) + drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK + ) if dev.compute_capability() >= (2, 0): self.smem_alloc_granularity = 128 @@ -277,8 +307,7 @@ class DeviceData: return _int_ceiling(bytes, self.align_bytes(word_size)) def align_dtype(self, elements, dtype_size): - return _int_ceiling(elements, - self.align_words(dtype_size)) + return _int_ceiling(elements, self.align_words(dtype_size)) def align_words(self, word_size): return _exact_div(self.align_bytes(word_size), word_size) @@ -298,31 +327,35 @@ class DeviceData: @staticmethod def make_valid_tex_channel_count(size): - valid_sizes = [1,2,4] + valid_sizes = [1, 2, 4] for vs in valid_sizes: if size <= vs: return vs raise ValueError("could not enlarge argument to valid channel count") + # }}} # {{{ occupancy + class OccupancyRecord: def __init__(self, devdata, threads, shared_mem=0, registers=0): if threads > devdata.max_threads: raise ValueError("too many threads") # copied literally from occupancy calculator - alloc_warps = _int_ceiling(threads/devdata.warp_size) + alloc_warps = _int_ceiling(threads / devdata.warp_size) alloc_smem = _int_ceiling(shared_mem, devdata.smem_alloc_granularity) if devdata.register_allocation_unit == "warp": - alloc_regs = alloc_warps*32*registers + alloc_regs = alloc_warps * 32 * registers elif devdata.register_allocation_unit == "block": - alloc_regs = _int_ceiling(alloc_warps*2, 
4)*16*registers + alloc_regs = _int_ceiling(alloc_warps * 2, 4) * 16 * registers else: - raise ValueError("Improper register allocation unit:"+devdata.register_allocation_unit) + raise ValueError( + "Improper register allocation unit:" + devdata.register_allocation_unit + ) if alloc_regs > devdata.registers: raise ValueError("too many registers") @@ -330,33 +363,37 @@ class OccupancyRecord: if alloc_smem > devdata.shared_memory: raise ValueError("too much smem") - self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"), - (_int_floor(devdata.warps_per_mp/alloc_warps), "warps") - ] + self.tb_per_mp_limits = [ + (devdata.thread_blocks_per_mp, "device"), + (_int_floor(devdata.warps_per_mp / alloc_warps), "warps"), + ] if registers > 0: - self.tb_per_mp_limits.append((_int_floor(devdata.registers/alloc_regs), "regs")) + self.tb_per_mp_limits.append( + (_int_floor(devdata.registers / alloc_regs), "regs") + ) if shared_mem > 0: - self.tb_per_mp_limits.append((_int_floor(devdata.shared_memory/alloc_smem), "smem")) + self.tb_per_mp_limits.append( + (_int_floor(devdata.shared_memory / alloc_smem), "smem") + ) self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits) self.warps_per_mp = self.tb_per_mp * alloc_warps self.occupancy = self.warps_per_mp / devdata.warps_per_mp + # }}} # {{{ C types <-> dtypes + class Argument: def __init__(self, dtype, name): self.dtype = np.dtype(dtype) self.name = name def __repr__(self): - return "%s(%r, %s)" % ( - self.__class__.__name__, - self.name, - self.dtype) + return f"{self.__class__.__name__}({self.name!r}, {self.dtype})" def dtype_to_ctype(dtype, with_fp_tex_hack=False): @@ -379,13 +416,14 @@ def dtype_to_ctype(dtype, with_fp_tex_hack=False): class VectorArg(Argument): def declarator(self): - return "%s *%s" % (dtype_to_ctype(self.dtype), self.name) + return "{} *{}".format(dtype_to_ctype(self.dtype), self.name) struct_char = "P" + class ScalarArg(Argument): def declarator(self): - return "%s %s" % 
(dtype_to_ctype(self.dtype), self.name) + return "{} {}".format(dtype_to_ctype(self.dtype), self.name) @property def struct_char(self): @@ -396,15 +434,16 @@ class ScalarArg(Argument): return result - - def parse_c_arg(c_arg): from pycuda.compyte.dtypes import parse_c_arg_backend + return parse_c_arg_backend(c_arg, ScalarArg, VectorArg) + def get_arg_type(c_arg): return parse_c_arg(c_arg).struct_char + # }}} # {{{ context-dep memoization @@ -412,8 +451,6 @@ def get_arg_type(c_arg): context_dependent_memoized_functions = [] - - @decorator def context_dependent_memoize(func, *args): try: @@ -435,7 +472,6 @@ def context_dependent_memoize(func, *args): return result - def clear_context_caches(): for func in context_dependent_memoized_functions: try: @@ -445,13 +481,16 @@ def clear_context_caches(): else: ctx_dict.clear() + # }}} # {{{ py.test interaction + def mark_cuda_test(inner_f): def f(*args, **kwargs): import pycuda.driver + # appears to be idempotent, i.e. no harm in calling it more than once pycuda.driver.init() @@ -465,9 +504,11 @@ def mark_cuda_test(inner_f): ctx.pop() from pycuda.tools import clear_context_caches + clear_context_caches() from gc import collect + collect() try: @@ -477,8 +518,8 @@ def mark_cuda_test(inner_f): return mark_test.cuda(f) -# }}} +# }}} # vim: foldmethod=marker diff --git a/setup.cfg b/setup.cfg index b4f38af601d5948c50e30d14477373a40da6eea6..291835073f52a47141bc239a0be603bc742d7d39 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,9 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,W503 +ignore = E126,E127,E128,E123,E226,E241,E242,W503,N806,F405,E501 +# FIXME: fix in future: N806, F405, E501 max-line-length=85 +exclude=pycuda/compyte/ndarray,pycuda/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ diff --git a/setup.py b/setup.py index febe2f864f7080de300228943f7d00c4c4c1429b..2a7acbe3a5ec6d0355dd21fd59e2ecbba3141224 100644 --- a/setup.py +++ b/setup.py @@ -22,9 +22,17 @@ def 
search_on_path(filenames): def get_config_schema(): - from aksetup_helper import (ConfigSchema, Option, - IncludeDir, LibraryDir, Libraries, BoostLibraries, - Switch, StringListOption, make_boost_base_options) + from aksetup_helper import ( + ConfigSchema, + Option, + IncludeDir, + LibraryDir, + Libraries, + BoostLibraries, + Switch, + StringListOption, + make_boost_base_options, + ) nvcc_path = search_on_path(["nvcc", "nvcc.exe"]) if nvcc_path is None: @@ -41,69 +49,79 @@ def get_config_schema(): lib64 = "lib64" import sys + if sys.platform.startswith("win"): # https://github.com/inducer/pycuda/issues/113 lib64 = "lib/x64" - cxxflags_default.extend(['/EHsc']) - ldflags_default.extend(['/FORCE']) - elif 'darwin' in sys.platform: + cxxflags_default.extend(["/EHsc"]) + ldflags_default.extend(["/FORCE"]) + elif "darwin" in sys.platform: import glob - root_candidates = glob.glob('/Developer/NVIDIA/CUDA-*') + + root_candidates = glob.glob("/Developer/NVIDIA/CUDA-*") if root_candidates: cuda_root_default = root_candidates[-1] lib64 = "lib" default_lib_dirs = [ "${CUDA_ROOT}/lib", - "${CUDA_ROOT}/"+lib64, + "${CUDA_ROOT}/" + lib64, # https://github.com/inducer/pycuda/issues/98 "${CUDA_ROOT}/lib/stubs", "${CUDA_ROOT}/%s/stubs" % lib64, + ] + + if "darwin" in sys.platform: + default_lib_dirs.append("/usr/local/cuda/lib") + + return ConfigSchema( + make_boost_base_options() + + [ + Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), + BoostLibraries("python"), + BoostLibraries("thread"), + Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), + Option( + "CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit" + ), + Option( + "CUDA_PRETEND_VERSION", + help="Assumed CUDA version, in the form 3010 for 3.1.", + ), + IncludeDir("CUDA", None), + Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), + Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), + LibraryDir("CUDADRV", default_lib_dirs), + Libraries("CUDADRV", ["cuda"]), 
+ LibraryDir("CUDART", default_lib_dirs), + Libraries("CUDART", ["cudart"]), + LibraryDir("CURAND", default_lib_dirs), + Libraries("CURAND", ["curand"]), + StringListOption( + "CXXFLAGS", + cxxflags_default, + help="Any extra C++ compiler options to include", + ), + StringListOption( + "LDFLAGS", ldflags_default, help="Any extra linker options to include" + ), ] - - if 'darwin' in sys.platform: - default_lib_dirs.append( - "/usr/local/cuda/lib") - - return ConfigSchema(make_boost_base_options() + [ - Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), - - BoostLibraries("python"), - BoostLibraries("thread"), - - Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), - Option("CUDA_ROOT", default=cuda_root_default, - help="Path to the CUDA toolkit"), - Option("CUDA_PRETEND_VERSION", - help="Assumed CUDA version, in the form 3010 for 3.1."), - IncludeDir("CUDA", None), - - Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), - Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), - - LibraryDir("CUDADRV", default_lib_dirs), - Libraries("CUDADRV", ["cuda"]), - - LibraryDir("CUDART", default_lib_dirs), - Libraries("CUDART", ["cudart"]), - - LibraryDir("CURAND", default_lib_dirs), - Libraries("CURAND", ["curand"]), - - StringListOption("CXXFLAGS", cxxflags_default, - help="Any extra C++ compiler options to include"), - StringListOption("LDFLAGS", ldflags_default, - help="Any extra linker options to include"), - ]) + ) def main(): import sys - from aksetup_helper import (hack_distutils, get_config, setup, - ExtensionUsingNumpy, set_up_shipped_boost_if_requested, - check_git_submodules, NumpyBuildExtCommand) + from aksetup_helper import ( + hack_distutils, + get_config, + setup, + ExtensionUsingNumpy, + set_up_shipped_boost_if_requested, + check_git_submodules, + NumpyBuildExtCommand, + ) check_git_submodules() @@ -116,8 +134,11 @@ def main(): EXTRA_DEFINES["PYGPU_PYCUDA"] = "1" LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + 
conf["CUDADRV_LIB_DIR"] - LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + LIBRARIES = ( + conf["BOOST_PYTHON_LIBNAME"] + + conf["BOOST_THREAD_LIBNAME"] + + conf["CUDADRV_LIBNAME"] + ) if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] @@ -128,20 +149,20 @@ def main(): if conf["CUDA_PRETEND_VERSION"]: EXTRA_DEFINES["CUDAPP_PRETEND_CUDA_VERSION"] = conf["CUDA_PRETEND_VERSION"] - INCLUDE_DIRS = ['src/cpp'] + conf["BOOST_INC_DIR"] + INCLUDE_DIRS = ["src/cpp"] + conf["BOOST_INC_DIR"] if conf["CUDA_INC_DIR"]: INCLUDE_DIRS += conf["CUDA_INC_DIR"] conf["USE_CUDA"] = True - if 'darwin' in sys.platform and sys.maxsize == 2147483647: + if "darwin" in sys.platform and sys.maxsize == 2147483647: # The Python interpreter is running in 32 bit mode on OS X if "-arch" not in conf["CXXFLAGS"]: - conf["CXXFLAGS"].extend(['-arch', 'i386', '-m32']) + conf["CXXFLAGS"].extend(["-arch", "i386", "-m32"]) if "-arch" not in conf["LDFLAGS"]: - conf["LDFLAGS"].extend(['-arch', 'i386', '-m32']) + conf["LDFLAGS"].extend(["-arch", "i386", "-m32"]) - if 'darwin' in sys.platform: + if "darwin" in sys.platform: # set path to Cuda dynamic libraries, # as a safe substitute for DYLD_LIBRARY_PATH for lib_dir in conf["CUDADRV_LIB_DIR"]: @@ -153,103 +174,103 @@ def main(): if conf["CUDA_ENABLE_CURAND"]: EXTRA_DEFINES["HAVE_CURAND"] = 1 - EXTRA_SOURCES.extend([ - "src/wrapper/wrap_curand.cpp" - ]) + EXTRA_SOURCES.extend(["src/wrapper/wrap_curand.cpp"]) LIBRARIES.extend(conf["CURAND_LIBNAME"]) LIBRARY_DIRS.extend(conf["CURAND_LIB_DIR"]) ver_dic = {} - exec(compile(open("pycuda/__init__.py").read(), "pycuda/__init__.py", 'exec'), - ver_dic) + exec( + compile(open("pycuda/__init__.py").read(), "pycuda/__init__.py", "exec"), + ver_dic, + ) import sys + if sys.version_info >= (3,): pvt_struct_source = "src/wrapper/_pvt_struct_v3.cpp" else: pvt_struct_source = "src/wrapper/_pvt_struct_v2.cpp" - 
setup(name="pycuda", - # metadata - version=ver_dic["VERSION_TEXT"], - description="Python wrapper for Nvidia CUDA", - long_description=open("README.rst", "rt").read(), - author="Andreas Kloeckner", - author_email="inform@tiker.net", - license="MIT", - url="http://mathema.tician.de/software/pycuda", - classifiers=[ - 'Environment :: Console', - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: C++', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', - 'Topic :: Scientific/Engineering :: Visualization', - ], - - # build info - packages=["pycuda", "pycuda.gl", "pycuda.sparse", "pycuda.compyte"], - - setup_requires=[ - "numpy>=1.6", - ], - - python_requires="~=3.6", - install_requires=[ - "pytools>=2011.2", - "decorator>=3.2.0", - "appdirs>=1.4.0", - "mako", - ], - - test_requires=[ - "pytest>=2", - ], - - ext_package="pycuda", - ext_modules=[ - ExtensionUsingNumpy("_driver", - [ - "src/cpp/cuda.cpp", - "src/cpp/bitlog.cpp", - "src/wrapper/wrap_cudadrv.cpp", - "src/wrapper/mempool.cpp", - ]+EXTRA_SOURCES, - include_dirs=INCLUDE_DIRS, - library_dirs=LIBRARY_DIRS, - libraries=LIBRARIES, - define_macros=list(EXTRA_DEFINES.items()), - extra_compile_args=conf["CXXFLAGS"], - extra_link_args=conf["LDFLAGS"], - ), - ExtensionUsingNumpy("_pvt_struct", - [pvt_struct_source], - extra_compile_args=conf["CXXFLAGS"], - extra_link_args=conf["LDFLAGS"], - ), - ], - cmdclass={'build_ext': NumpyBuildExtCommand}, - 
include_package_data=True, - package_data={ - "pycuda": [ - "cuda/*.hpp", - ] - }, - - zip_safe=False) - - -if __name__ == '__main__': + setup( + name="pycuda", + # metadata + version=ver_dic["VERSION_TEXT"], + description="Python wrapper for Nvidia CUDA", + long_description=open("README.rst", "rt").read(), + author="Andreas Kloeckner", + author_email="inform@tiker.net", + license="MIT", + url="http://mathema.tician.de/software/pycuda", + classifiers=[ + "Environment :: Console", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: C++", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Physics", + "Topic :: Scientific/Engineering :: Visualization", + ], + # build info + packages=["pycuda", "pycuda.gl", "pycuda.sparse", "pycuda.compyte"], + setup_requires=[ + "numpy>=1.6", + ], + python_requires="~=3.6", + install_requires=[ + "pytools>=2011.2", + "decorator>=3.2.0", + "appdirs>=1.4.0", + "mako", + ], + test_requires=[ + "pytest>=2", + ], + ext_package="pycuda", + ext_modules=[ + ExtensionUsingNumpy( + "_driver", + [ + "src/cpp/cuda.cpp", + "src/cpp/bitlog.cpp", + "src/wrapper/wrap_cudadrv.cpp", + "src/wrapper/mempool.cpp", + ] + + EXTRA_SOURCES, + include_dirs=INCLUDE_DIRS, + library_dirs=LIBRARY_DIRS, + libraries=LIBRARIES, + define_macros=list(EXTRA_DEFINES.items()), + extra_compile_args=conf["CXXFLAGS"], + extra_link_args=conf["LDFLAGS"], + ), + ExtensionUsingNumpy( + "_pvt_struct", + [pvt_struct_source], + 
extra_compile_args=conf["CXXFLAGS"], + extra_link_args=conf["LDFLAGS"], + ), + ], + cmdclass={"build_ext": NumpyBuildExtCommand}, + include_package_data=True, + package_data={ + "pycuda": [ + "cuda/*.hpp", + ] + }, + zip_safe=False, + ) + + +if __name__ == "__main__": main() diff --git a/test/test_cumath.py b/test/test_cumath.py index 35fcbfa3bbd436126aab1e8a33ee48df8690392b..2db96d15573d1ba6878362ae734c6b387ddf25db 100644 --- a/test/test_cumath.py +++ b/test/test_cumath.py @@ -1,23 +1,11 @@ -from __future__ import division -from __future__ import absolute_import import math import numpy as np from pycuda.tools import mark_cuda_test -from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False - - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv # noqa - import pycuda.cumath as cumath +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv # noqa +import pycuda.cumath as cumath sizes = [10, 128, 1024, 1 << 10, 1 << 13] @@ -26,10 +14,10 @@ complex_dtypes = [np.complex64, np.complex128] numpy_func_names = { - "asin": "arcsin", - "acos": "arccos", - "atan": "arctan", - } + "asin": "arcsin", + "acos": "arccos", + "atan": "arctan", +} def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False): @@ -44,71 +32,67 @@ def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False): for s in sizes: for dtype in _dtypes: np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) if complex: - A += (np.random.random(s)*(b-a) + a)*1j + A += (np.random.random(s) * (b - a) + a) * 1j args = gpuarray.to_gpu(A) gpu_results = gpu_func(args).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) - assert (max_err <= threshold).all(), \ - (max_err, name, dtype) + assert (max_err <= threshold).all(), (max_err, name, dtype) gpu_results2 = gpuarray.empty_like(args) gr2 = 
gpu_func(args, out=gpu_results2) assert gpu_results2 is gr2 gr2 = gr2.get() max_err = np.max(np.abs(cpu_results - gr2)) - assert (max_err <= threshold).all(), \ - (max_err, name, dtype) + assert (max_err <= threshold).all(), (max_err, name, dtype) return mark_cuda_test(test) -if have_pycuda(): - test_ceil = make_unary_function_test("ceil", -10, 10) - test_floor = make_unary_function_test("ceil", -10, 10) - test_fabs = make_unary_function_test("fabs", -10, 10) - test_exp = make_unary_function_test("exp", -3, 3, 1e-5) - test_exp_c = make_unary_function_test("exp", -3, 3, 1e-5, complex=True) - test_log = make_unary_function_test("log", 1e-5, 1, 5e-7) - test_log10 = make_unary_function_test("log10", 1e-5, 1, 3e-7) - test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) - - test_sin = make_unary_function_test("sin", -10, 10, 1e-7) - test_sin_c = make_unary_function_test("sin", -3, 3, 2.1e-6, complex=True) - test_cos = make_unary_function_test("cos", -10, 10, 1e-7) - test_cos_c = make_unary_function_test("cos", -3, 3, 2.1e-6, complex=True) - test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) - #test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) - test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) - #test_acos_c = make_unary_function_test("acos", -0.9, 0.9, 2e-6, complex=True) - test_tan = make_unary_function_test("tan", - -math.pi/2 + 0.1, math.pi/2 - 0.1, 1e-5) - test_tan_c = make_unary_function_test("tan", - -math.pi/2 + 0.1, math.pi/2 - 0.1, 3e-5, complex=True) - test_atan = make_unary_function_test("atan", -10, 10, 2e-7) - - test_sinh = make_unary_function_test("sinh", -3, 3, 2e-6) - test_sinh_c = make_unary_function_test("sinh", -3, 3, 3e-6, complex=True) - test_cosh = make_unary_function_test("cosh", -3, 3, 2e-6) - test_cosh_c = make_unary_function_test("cosh", -3, 3, 3e-6, complex=True) - test_tanh = make_unary_function_test("tanh", -3, 3, 2e-6) - test_tanh_c = make_unary_function_test("tanh", - -math.pi/2 + 
 0.1, math.pi/2 - 0.1, 3e-5, complex=True) +test_ceil = make_unary_function_test("ceil", -10, 10) +test_floor = make_unary_function_test("floor", -10, 10) +test_fabs = make_unary_function_test("fabs", -10, 10) +test_exp = make_unary_function_test("exp", -3, 3, 1e-5) +test_exp_c = make_unary_function_test("exp", -3, 3, 1e-5, complex=True) +test_log = make_unary_function_test("log", 1e-5, 1, 5e-7) +test_log10 = make_unary_function_test("log10", 1e-5, 1, 3e-7) +test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) + +test_sin = make_unary_function_test("sin", -10, 10, 1e-7) +test_sin_c = make_unary_function_test("sin", -3, 3, 2.1e-6, complex=True) +test_cos = make_unary_function_test("cos", -10, 10, 1e-7) +test_cos_c = make_unary_function_test("cos", -3, 3, 2.1e-6, complex=True) +test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) +# test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) +test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) +# test_acos_c = make_unary_function_test("acos", -0.9, 0.9, 2e-6, complex=True) +test_tan = make_unary_function_test("tan", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 1e-5) +test_tan_c = make_unary_function_test( + "tan", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 3e-5, complex=True +) +test_atan = make_unary_function_test("atan", -10, 10, 2e-7) + +test_sinh = make_unary_function_test("sinh", -3, 3, 2e-6) +test_sinh_c = make_unary_function_test("sinh", -3, 3, 3e-6, complex=True) +test_cosh = make_unary_function_test("cosh", -3, 3, 2e-6) +test_cosh_c = make_unary_function_test("cosh", -3, 3, 3e-6, complex=True) +test_tanh = make_unary_function_test("tanh", -3, 3, 2e-6) +test_tanh_c = make_unary_function_test( + "tanh", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 3e-5, complex=True +) class TestMath: - disabled = not have_pycuda() - @mark_cuda_test def test_fmod(self): """tests if the fmod function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 - a2 = gpuarray.arange(s, 
dtype=np.float32)/45.2 + 0.1 + a = gpuarray.arange(s, dtype=np.float32) / 10 + a2 = gpuarray.arange(s, dtype=np.float32) / 45.2 + 0.1 b = cumath.fmod(a, a2) a = a.get() @@ -123,7 +107,7 @@ class TestMath: """tests if the ldexp function works""" for s in sizes: a = gpuarray.arange(s, dtype=np.float32) - a2 = gpuarray.arange(s, dtype=np.float32)*1e-3 + a2 = gpuarray.arange(s, dtype=np.float32) * 1e-3 b = cumath.ldexp(a, a2) a = a.get() @@ -137,7 +121,7 @@ class TestMath: def test_modf(self): """tests if the modf function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 + a = gpuarray.arange(s, dtype=np.float32) / 10 fracpart, intpart = cumath.modf(a) a = a.get() @@ -154,7 +138,7 @@ class TestMath: def test_frexp(self): """tests if the frexp function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 + a = gpuarray.arange(s, dtype=np.float32) / 10 significands, exponents = cumath.frexp(a) a = a.get() @@ -178,12 +162,12 @@ class TestMath: for s in sizes: for dtype in dtypes: np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) if complex: - A = A + (np.random.random(s)*(b-a) + a)*1j + A = A + (np.random.random(s) * (b - a) + a) * 1j np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) # 'out' kw @@ -203,7 +187,7 @@ class TestMath: # 'stream' kw mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream).get() @@ -214,7 +198,7 @@ class TestMath: # 'stream' position mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = 
gpuarray.empty_like(args) gpu_results = gpu_func(args, mystream).get() @@ -225,7 +209,7 @@ class TestMath: # 'out' and 'stream' kw mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream, out=gpu_results).get() @@ -239,8 +223,10 @@ if __name__ == "__main__": import pycuda.autoinit # noqa import sys + if len(sys.argv) > 1: - exec (sys.argv[1]) + exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/test_driver.py b/test/test_driver.py index 9ff010e6e647d8e87f122c504c0f7282d531c1bb..0c074efe05e93bd4a39ac20a22ed86586bf1a53a 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -1,43 +1,32 @@ -from __future__ import division, absolute_import, print_function import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype import pytest # noqa -from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except Exception: - return False - - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv - from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv +from pycuda.compiler import SourceModule class TestDriver: - disabled = not have_pycuda() - @mark_cuda_test def test_memory(self): z = np.random.randn(400).astype(np.float32) new_z = drv.from_device_like(drv.to_device(z), z) - assert la.norm(new_z-z) == 0 + assert la.norm(new_z - z) == 0 @mark_cuda_test def test_simple_kernel(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -45,20 +34,20 @@ class TestDriver: b = 
np.random.randn(400).astype(np.float32) dest = np.zeros_like(a) - multiply_them( - drv.Out(dest), drv.In(a), drv.In(b), - block=(400, 1, 1)) - assert la.norm(dest-a*b) == 0 + multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1)) + assert la.norm(dest - a * b) == 0 @mark_cuda_test def test_simple_kernel_2(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -68,29 +57,29 @@ class TestDriver: b_gpu = drv.to_device(b) dest = np.zeros_like(a) - multiply_them( - drv.Out(dest), a_gpu, b_gpu, - block=(400, 1, 1)) - assert la.norm(dest-a*b) == 0 + multiply_them(drv.Out(dest), a_gpu, b_gpu, block=(400, 1, 1)) + assert la.norm(dest - a * b) == 0 drv.Context.synchronize() # now try with offsets dest = np.zeros_like(a) multiply_them( - drv.Out(dest), np.intp(a_gpu)+a.itemsize, b_gpu, - block=(399, 1, 1)) + drv.Out(dest), np.intp(a_gpu) + a.itemsize, b_gpu, block=(399, 1, 1) + ) - assert la.norm((dest[:-1]-a[1:]*b[:-1])) == 0 + assert la.norm(dest[:-1] - a[1:] * b[:-1]) == 0 @mark_cuda_test def test_vector_types(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void set_them(float3 *dest, float3 x) { const int i = threadIdx.x; dest[i] = x; } - """) + """ + ) set_them = mod.get_function("set_them") a = gpuarray.vec.make_float3(1, 2, 3) @@ -105,13 +94,15 @@ class TestDriver: # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. 
- mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -130,28 +121,27 @@ class TestDriver: strm.synchronize() dest = drv.pagelocked_empty_like(a) - multiply_them( - drv.Out(dest), a_gpu, b_gpu, - block=shape+(1,), stream=strm) + multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() - assert la.norm(dest-a*b) == 0 + assert la.norm(dest - a * b) == 0 @mark_cuda_test def test_gpuarray(self): a = np.arange(200000, dtype=np.float32) b = a + 17 import pycuda.gpuarray as gpuarray + a_g = gpuarray.to_gpu(a) b_g = gpuarray.to_gpu(b) - diff = (a_g-3*b_g+(-a_g)).get() - (a-3*b+(-a)) + diff = (a_g - 3 * b_g + (-a_g)).get() - (a - 3 * b + (-a)) assert la.norm(diff) == 0 - diff = ((a_g*b_g).get()-a*b) + diff = (a_g * b_g).get() - a * b assert la.norm(diff) == 0 @mark_cuda_test @@ -162,14 +152,15 @@ class TestDriver: shape = (10,) a = blas.ones(shape, dtype=np.float32) - b = 33*blas.ones(shape, dtype=np.float32) - assert ((-a+b).from_gpu() == 32).all() + b = 33 * blas.ones(shape, dtype=np.float32) + assert ((-a + b).from_gpu() == 32).all() self.test_streamed_kernel() @mark_cuda_test def test_2d_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; __global__ void copy_texture(float *dest) @@ -179,7 +170,8 @@ class TestDriver: int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col); } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -189,16 +181,13 @@ class TestDriver: drv.matrix_to_texref(a, mtx_tex, order="F") dest = np.zeros(shape, dtype=np.float32) - copy_texture( - drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) - assert 
la.norm(dest-a) == 0 + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) + assert la.norm(dest - a) == 0 @mark_cuda_test def test_multiple_2d_textures(self): - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; texture mtx2_tex; @@ -212,28 +201,27 @@ class TestDriver: + tex2D(mtx2_tex, row, col); } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx2_tex = mod.get_texref("mtx2_tex") - shape = (3,4) + shape = (3, 4) a = np.random.randn(*shape).astype(np.float32) b = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") drv.matrix_to_texref(b, mtx2_tex, order="F") dest = np.zeros(shape, dtype=np.float32) - copy_texture(drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex, mtx2_tex] - ) - assert la.norm(dest-a-b) < 1e-6 + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex, mtx2_tex]) + assert la.norm(dest - a - b) < 1e-6 @mark_cuda_test def test_multichannel_2d_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ #define CHANNELS 4 texture mtx_tex; @@ -248,7 +236,8 @@ class TestDriver: dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -256,25 +245,21 @@ class TestDriver: shape = (5, 6) channels = 4 a = np.asarray( - np.random.randn(*((channels,)+shape)), - dtype=np.float32, order="F") - drv.bind_array_to_texref( - drv.make_multichannel_2d_array(a, order="F"), mtx_tex) + np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F" + ) + drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex) - dest = np.zeros(shape+(channels,), dtype=np.float32) - copy_texture( - drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) + dest = np.zeros(shape + (channels,), dtype=np.float32) + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) 
reshaped_a = a.transpose(1, 2, 0) - #print reshaped_a - #print dest - assert la.norm(dest-reshaped_a) == 0 + # print reshaped_a + # print dest + assert la.norm(dest - reshaped_a) == 0 @mark_cuda_test def test_multichannel_linear_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ #define CHANNELS 4 texture mtx_tex; @@ -287,43 +272,45 @@ class TestDriver: dest[i*CHANNELS + 2] = texval.z; dest[i*CHANNELS + 3] = texval.w; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (16, 16) channels = 4 - a = np.random.randn(*(shape+(channels,))).astype(np.float32) + a = np.random.randn(*(shape + (channels,))).astype(np.float32) a_gpu = drv.to_device(a) mtx_tex.set_address(a_gpu, a.nbytes) mtx_tex.set_format(drv.array_format.FLOAT, 4) - dest = np.zeros(shape+(channels,), dtype=np.float32) - copy_texture(drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) - #print a - #print dest - assert la.norm(dest-a) == 0 + dest = np.zeros(shape + (channels,), dtype=np.float32) + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) + # print a + # print dest + assert la.norm(dest - a) == 0 @mark_cuda_test def test_2d_fp_textures(self): orden = "F" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = 
"fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -334,40 +321,50 @@ class TestDriver: dest[row + col*blockDim.x*gridDim.x] = fp_tex2D(mtx_tex, col, row); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (16,16,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=False) + cuBlock = (16, 16, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test - def test_2d_fp_texturesLayered(self): + def test_2d_fp_textures_layered(self): orden = "F" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == 
np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -378,22 +375,28 @@ class TestDriver: dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (16,16,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True) + cuBlock = (16, 16, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert 
np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -401,17 +404,21 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -422,22 +429,28 @@ class TestDriver: int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (8,8,8) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,npoints) - cuGrid = 
(npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),npoints//cuBlock[2]+1*(npoints % cuBlock[1] != 0 )) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=False) + cuBlock = (8, 8, 8) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, npoints) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -445,30 +458,30 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: - fpName_str = 'fp_tex_cfloat' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:].real = np.random.rand(npoints,npoints,npoints)[:] - A_cpu[:].imag = np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_cfloat" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:].real = np.random.rand(npoints, npoints, npoints)[:] + A_cpu[:].imag = np.random.rand(npoints, npoints, npoints)[:] elif prec == np.complex128: - fpName_str = 'fp_tex_cdouble' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:].real = np.random.rand(npoints,npoints,npoints)[:] - A_cpu[:].imag = 
np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_cdouble" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:].real = np.random.rand(npoints, npoints, npoints)[:] + A_cpu[:].imag = np.random.rand(npoints, npoints, npoints)[:] elif prec == np.float64: - fpName_str = 'fp_tex_double' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_double" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:]*100. + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] * 100.0 - A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized + A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized - myKernRW = ''' + myKernRW = """ #include surface mtx_tex; @@ -488,25 +501,35 @@ class TestDriver: dest[tid] = aux; } } - ''' - myKernRW = myKernRW.replace('fpName',fpName_str) - myKernRW = myKernRW.replace('cuPres',prec_str) + """ + myKernRW = myKernRW.replace("fpName", fpName_str) + myKernRW = myKernRW.replace("cuPres", prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") - cuBlock = (8,8,8) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,npoints) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),npoints//cuBlock[2]+1*(npoints % cuBlock[1] != 0 )) - copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) - A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros - cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) - A_cpu = A_gpu.get() # To remember original array + cuBlock = (8, 8, 8) + if 
cuBlock[0] > npoints: + cuBlock = (npoints, npoints, npoints) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), + ) + copy_texture.prepare("Pi") # ,texrefs=[mtx_tex]) + A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros + cudaArray = drv.gpuarray_to_array(A_gpu2, orden, allowSurfaceBind=True) + A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(0) + ) # Write random array + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(1) + ) # Read, but transposed + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -514,17 +537,21 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized - - myKernRW = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = 
np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized + + myKernRW = """ #include surface mtx_tex; @@ -544,31 +571,42 @@ class TestDriver: dest[tid] = aux; } } - ''' - myKernRW = myKernRW.replace('fpName',fpName_str) - myKernRW = myKernRW.replace('cuPres',prec_str) + """ + myKernRW = myKernRW.replace("fpName", fpName_str) + myKernRW = myKernRW.replace("cuPres", prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") - cuBlock = (8,8,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) - A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros - cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) - A_cpu = A_gpu.get() # To remember original array + cuBlock = (8, 8, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("Pi") # ,texrefs=[mtx_tex]) + A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros + cudaArray = drv.gpuarray_to_array(A_gpu2, orden, allowSurfaceBind=True) + A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(0) + ) # Write random array + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(1) + ) # Read, but 
transposed + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test def test_large_smem(self): n = 4000 - mod = SourceModule(""" + mod = SourceModule( + """ #include __global__ void kernel(int *d_data) @@ -577,36 +615,47 @@ class TestDriver: sdata[threadIdx.x] = threadIdx.x; d_data[threadIdx.x] = sdata[threadIdx.x]; } - """ % n) + """ + % n + ) kernel = mod.get_function("kernel") import pycuda.gpuarray as gpuarray + arg = gpuarray.zeros((n,), dtype=np.float32) - kernel(arg, block=(1,1,1,), ) + kernel( + arg, + block=( + 1, + 1, + 1, + ), + ) @mark_cuda_test def test_bitlog(self): from pycuda.tools import bitlog2 + assert bitlog2(17) == 4 - assert bitlog2(0xaffe) == 15 - assert bitlog2(0x3affe) == 17 - assert bitlog2(0xcc3affe) == 27 + assert bitlog2(0xAFFE) == 15 + assert bitlog2(0x3AFFE) == 17 + assert bitlog2(0xCC3AFFE) == 27 @mark_cuda_test def test_mempool_2(self): - from pycuda.tools import DeviceMemoryPool as DMP + from pycuda.tools import DeviceMemoryPool from random import randrange for i in range(2000): - s = randrange(1<<31) >> randrange(32) - bin_nr = DMP.bin_number(s) - asize = DMP.alloc_size(bin_nr) + s = randrange(1 << 31) >> randrange(32) + bin_nr = DeviceMemoryPool.bin_number(s) + asize = DeviceMemoryPool.alloc_size(bin_nr) assert asize >= s, s - assert DMP.bin_number(asize) == bin_nr, s - assert asize < asize*(1+1/8) + assert DeviceMemoryPool.bin_number(asize) == bin_nr, s + assert asize < asize * (1 + 1 / 8) @mark_cuda_test def test_mempool(self): @@ -614,13 +663,12 @@ class TestDriver: from pycuda.tools import DeviceMemoryPool pool = DeviceMemoryPool() - maxlen = 10 queue = [] free, total = drv.mem_get_info() e0 = bitlog2(free) - for e in range(e0-6, e0-4): + for e in range(e0 - 6, e0 - 4): for i in range(100): queue.append(pool.allocate(1 << e)) if len(queue) > 10: @@ -669,9 +717,7 @@ class TestDriver: d = 8 shape = (w, h, d) - a = np.asarray( - np.random.randn(*shape), - 
dtype=np.float32, order="F") + a = np.asarray(np.random.randn(*shape), dtype=np.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w @@ -692,7 +738,8 @@ class TestDriver: copy() - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; __global__ void copy_texture(float *dest) @@ -706,7 +753,8 @@ class TestDriver: dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -715,47 +763,49 @@ class TestDriver: dest = np.zeros(shape, dtype=np.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) - assert la.norm(dest-a) == 0 + assert la.norm(dest - a) == 0 @mark_cuda_test def test_prepared_invocation(self): - a = np.random.randn(4,4).astype(np.float32) + a = np.random.randn(4, 4).astype(np.float32) a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize) drv.memcpy_htod(a_gpu, a) - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*blockDim.x; a[idx] *= 2; } - """) + """ + ) func = mod.get_function("doublify") func.prepare("P") - func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20) + func.prepared_call((1, 1), (4, 4, 1), a_gpu, shared_size=20) a_doubled = np.empty_like(a) drv.memcpy_dtoh(a_doubled, a_gpu) - print (a) - print (a_doubled) - assert la.norm(a_doubled-2*a) == 0 + print(a) + print(a_doubled) + assert la.norm(a_doubled - 2 * a) == 0 # now with offsets func.prepare("P") a_quadrupled = np.empty_like(a) - func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize) + func.prepared_call((1, 1), (15, 1, 1), int(a_gpu) + a.dtype.itemsize) drv.memcpy_dtoh(a_quadrupled, a_gpu) - assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0 + assert la.norm(a_quadrupled[1:] - 4 * a[1:]) == 0 @mark_cuda_test def test_prepared_with_vector(self): - cuda_source = r''' + cuda_source = r""" __global__ void cuda_function(float3 input) { float3 result = 
make_float3(input.x, input.y, input.z); } - ''' + """ mod = SourceModule(cuda_source, cache_dir=False, keep=False) @@ -763,8 +813,9 @@ class TestDriver: arg_types = [gpuarray.vec.float3] kernel.prepare(arg_types) - kernel.prepared_call((1, 1, 1), (1, 1, 1), - gpuarray.vec.make_float3(0.0, 1.0, 2.0)) + kernel.prepared_call( + (1, 1, 1), (1, 1, 1), gpuarray.vec.make_float3(0.0, 1.0, 2.0) + ) @mark_cuda_test def test_fp_textures(self): @@ -772,10 +823,9 @@ class TestDriver: return for tp in [np.float32, np.float64]: - from pycuda.tools import dtype_to_ctype - tp_cstr = dtype_to_ctype(tp) - mod = SourceModule(""" + mod = SourceModule( + """ #include texture my_tex; @@ -785,49 +835,55 @@ class TestDriver: int i = threadIdx.x; dest[i] = fp_tex1Dfetch(my_tex, i); } - """ % {"tp": tp_cstr}) + """ + % {"tp": tp_cstr} + ) copy_texture = mod.get_function("copy_texture") my_tex = mod.get_texref("my_tex") - import pycuda.gpuarray as gpuarray - shape = (384,) a = np.random.randn(*shape).astype(tp) a_gpu = gpuarray.to_gpu(a) a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True) dest = np.zeros(shape, dtype=tp) - copy_texture(drv.Out(dest), - block=shape+(1,1,), - texrefs=[my_tex]) + copy_texture( + drv.Out(dest), + block=shape + + ( + 1, + 1, + ), + texrefs=[my_tex], + ) - assert la.norm(dest-a) == 0 + assert la.norm(dest - a) == 0 @mark_cuda_test def test_constant_memory(self): # contributed by Andrew Wagner - module = SourceModule(""" + module = SourceModule( + """ __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } - """) + """ + ) copy_constant_into_global = module.get_function("copy_constant_into_global") - const_array, _ = module.get_global('const_array') + const_array, _ = module.get_global("const_array") - host_array = np.random.randint(0,255,(32,)).astype(np.float32) + host_array = np.random.randint(0, 255, (32,)).astype(np.float32) 
global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) - copy_constant_into_global( - global_result_array, - grid=(1, 1), block=(32, 1, 1)) + copy_constant_into_global(global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) @@ -838,14 +894,17 @@ class TestDriver: def test_register_host_memory(self): if drv.get_version() < (4,): from py.test import skip + skip("register_host_memory only exists on CUDA 4.0 and later") import sys + if sys.platform == "darwin": from py.test import skip + skip("register_host_memory is not supported on OS X") - a = drv.aligned_empty((2**20,), np.float64) + a = drv.aligned_empty((2 ** 20,), np.float64) a_pin = drv.register_host_memory(a) gpu_ary = drv.mem_alloc_like(a) @@ -860,6 +919,7 @@ class TestDriver: if drv.Context.get_device().compute_capability() < (3, 5): from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") cuda_string = """ @@ -909,11 +969,20 @@ class TestDriver: drv.memcpy_htod(b_gpu, b) from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") - func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, - block=(100, 1, 1), grid=(1, 1, 1)) + func( + a_gpu, + b_gpu, + c_gpu, + d_gpu, + e_gpu, + f_gpu, + block=(100, 1, 1), + grid=(1, 1, 1), + ) drv.memcpy_dtoh(c, c_gpu) drv.memcpy_dtoh(d, d_gpu) @@ -927,41 +996,46 @@ class TestDriver: c = np.empty_like(a) d = np.empty_like(a) e = np.empty_like(a) - f = np.array(a, dtype='d') + f = np.array(a, dtype="d") math(a, b, c, d, e, f) @mark_cuda_test def test_jit_link_module(self): from pycuda.compiler import DEFAULT_NVCC_FLAGS + if drv.Context.get_device().compute_capability() < (3, 5): from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") - test_outer_cu = '''#include + test_outer_cu = """#include 
__global__ void test_kernel() { extern __global__ void test_kernel_inner(); printf("Hello outer world!\\n"); test_kernel_inner<<<2, 1>>>(); - }''' + }""" - test_inner_cu = '''#include + test_inner_cu = """#include __global__ void test_kernel_inner() { printf(" Hello inner world!\\n"); - }''' + }""" from pycuda.compiler import DynamicModule + mod = DynamicModule() mod.add_source( - test_outer_cu, nvcc_options=( - ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) + test_outer_cu, + nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS), + ) mod.add_source( - test_inner_cu, nvcc_options=( - ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) - mod.add_stdlib('cudadevrt') + test_inner_cu, + nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS), + ) + mod.add_stdlib("cudadevrt") mod.link() - test_kernel = mod.get_function('test_kernel') + test_kernel = mod.get_function("test_kernel") test_kernel(grid=(2, 1), block=(1, 1, 1)) @@ -978,8 +1052,10 @@ if __name__ == "__main__": import pycuda.autoinit # noqa import sys + if len(sys.argv) > 1: exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index b8f2d43e3addf57a723db48fdf52624a4eee209a..fb6a20fcffc6943bf814d8a2fb0d421ba17cb8f9 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1,39 +1,27 @@ #! 
/usr/bin/env python -from __future__ import absolute_import, print_function import numpy as np import numpy.linalg as la import sys from pycuda.tools import mark_cuda_test from pycuda.characterize import has_double_support -from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv - from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv +from pycuda.compiler import SourceModule class TestGPUArray: - disabled = not have_pycuda() - @mark_cuda_test def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, a_gpu).get() - assert (np.abs(a**a - result) < 1e-3).all() + assert (np.abs(a ** a - result) < 1e-3).all() - result = (a_gpu**a_gpu).get() + result = (a_gpu ** a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() a_gpu **= a_gpu @@ -46,11 +34,11 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, 2).get() - assert (np.abs(a**2 - result) < 1e-3).all() + assert (np.abs(a ** 2 - result) < 1e-3).all() a_gpu **= 2 a_gpu = a_gpu.get() - assert (np.abs(a**2 - a_gpu) < 1e-3).all() + assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() @mark_cuda_test def test_numpy_integer_shape(self): @@ -90,10 +78,7 @@ class TestGPUArray: """Test the muliplication of an array with a scalar. 
""" for sz in [10, 50000]: - for dtype, scalars in [ - (np.float32, [2]), - (np.complex64, [2, 2j]) - ]: + for dtype, scalars in [(np.float32, [2]), (np.complex64, [2, 2j])]: for scalar in scalars: a = np.arange(sz).astype(dtype) a_gpu = gpuarray.to_gpu(a) @@ -106,10 +91,10 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - two_a = 2*a_gpu + two_a = 2 * a_gpu assert isinstance(two_a, gpuarray.GPUArray) - two_a = np.float32(2)*a_gpu + two_a = np.float32(2) * a_gpu assert isinstance(two_a, gpuarray.GPUArray) @mark_cuda_test @@ -121,9 +106,9 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(a) - a_squared = (b_gpu*a_gpu).get() + a_squared = (b_gpu * a_gpu).get() - assert (a*a == a_squared).all() + assert (a * a == a_squared).all() @mark_cuda_test def test_addition_array(self): @@ -131,9 +116,9 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - a_added = (a_gpu+a_gpu).get() + a_added = (a_gpu + a_gpu).get() - assert (a+a == a_added).all() + assert (a + a == a_added).all() @mark_cuda_test def test_iaddition_array(self): @@ -144,7 +129,7 @@ class TestGPUArray: a_gpu += a_gpu a_added = a_gpu.get() - assert (a+a == a_added).all() + assert (a + a == a_added).all() @mark_cuda_test def test_addition_scalar(self): @@ -152,9 +137,9 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - a_added = (7+a_gpu).get() + a_added = (7 + a_gpu).get() - assert (7+a == a_added).all() + assert (7 + a == a_added).all() @mark_cuda_test def test_iaddition_scalar(self): @@ -165,39 +150,39 @@ class TestGPUArray: a_gpu += 7 a_added = a_gpu.get() - assert (7+a == a_added).all() + assert (7 + a == a_added).all() @mark_cuda_test def test_substract_array(self): """Test the substraction of two arrays.""" - #test data + # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = 
np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - result = (a_gpu-b_gpu).get() - assert (a-b == result).all() + result = (a_gpu - b_gpu).get() + assert (a - b == result).all() - result = (b_gpu-a_gpu).get() - assert (b-a == result).all() + result = (b_gpu - a_gpu).get() + assert (b - a == result).all() @mark_cuda_test def test_substract_scalar(self): """Test the substraction of an array and a scalar.""" - #test data + # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - #convert a to a gpu object + # convert a to a gpu object a_gpu = gpuarray.to_gpu(a) - result = (a_gpu-7).get() - assert (a-7 == result).all() + result = (a_gpu - 7).get() + assert (a - 7 == result).all() - result = (7-a_gpu).get() - assert (7-a == result).all() + result = (7 - a_gpu).get() + assert (7 - a == result).all() @mark_cuda_test def test_divide_scalar(self): @@ -206,28 +191,28 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - result = (a_gpu/2).get() - assert (a/2 == result).all() + result = (a_gpu / 2).get() + assert (a / 2 == result).all() - result = (2/a_gpu).get() - assert (2/a == result).all() + result = (2 / a_gpu).get() + assert (2 / a == result).all() @mark_cuda_test def test_divide_array(self): """Test the division of an array and a scalar. 
""" - #test data + # test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - a_divide = (a_gpu/b_gpu).get() - assert (np.abs(a/b - a_divide) < 1e-3).all() + a_divide = (a_gpu / b_gpu).get() + assert (np.abs(a / b - a_divide) < 1e-3).all() - a_divide = (b_gpu/a_gpu).get() - assert (np.abs(b/a - a_divide) < 1e-3).all() + a_divide = (b_gpu / a_gpu).get() + assert (np.abs(b / a - a_divide) < 1e-3).all() @mark_cuda_test def test_random(self): @@ -247,29 +232,39 @@ class TestGPUArray: @mark_cuda_test def test_curand_wrappers(self): from pycuda.curandom import get_curand_version + if get_curand_version() is None: from pytest import skip + skip("curand not installed") generator_types = [] if get_curand_version() >= (3, 2, 0): from pycuda.curandom import ( - XORWOWRandomNumberGenerator, - Sobol32RandomNumberGenerator) - generator_types.extend([ - XORWOWRandomNumberGenerator, - Sobol32RandomNumberGenerator]) + XORWOWRandomNumberGenerator, + Sobol32RandomNumberGenerator, + ) + + generator_types.extend( + [XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator] + ) if get_curand_version() >= (4, 0, 0): from pycuda.curandom import ( + ScrambledSobol32RandomNumberGenerator, + Sobol64RandomNumberGenerator, + ScrambledSobol64RandomNumberGenerator, + ) + + generator_types.extend( + [ ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, - ScrambledSobol64RandomNumberGenerator) - generator_types.extend([ - ScrambledSobol32RandomNumberGenerator, - Sobol64RandomNumberGenerator, - ScrambledSobol64RandomNumberGenerator]) + ScrambledSobol64RandomNumberGenerator, + ] + ) if get_curand_version() >= (4, 1, 0): from pycuda.curandom import MRG32k3aRandomNumberGenerator + generator_types.extend([MRG32k3aRandomNumberGenerator]) if has_double_support(): @@ -303,7 +298,7 @@ class TestGPUArray: v = 10 a.fill(v) 
gen.fill_poisson(a) - tmp = (a.get() == (v-1)).sum() / a.size + tmp = (a.get() == (v - 1)).sum() / a.size # noqa: F841 # Commented out for CI on the off chance it'd fail # # Check Poisson statistics (need 1e6 values) # # Compare with scipy.stats.poisson.pmf(v - 1, v) @@ -394,11 +389,12 @@ class TestGPUArray: def make_nan_contaminated_vector(size): shape = (size,) a = np.random.randn(*shape).astype(np.float32) - #for i in range(0, shape[0], 3): - #a[i] = float('nan') + # for i in range(0, shape[0], 3): + # a[i] = float('nan') from random import randrange - for i in range(size//10): - a[randrange(0, size)] = float('nan') + + for i in range(size // 10): + a[randrange(0, size)] = float("nan") return a size = 1 << 20 @@ -408,8 +404,8 @@ class TestGPUArray: b = make_nan_contaminated_vector(size) b_gpu = gpuarray.to_gpu(b) - ab = a*b - ab_gpu = (a_gpu*b_gpu).get() + ab = a * b + ab_gpu = (a_gpu * b_gpu).get() assert (np.isnan(ab) == np.isnan(ab_gpu)).all() @@ -421,30 +417,32 @@ class TestGPUArray: b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel + lin_comb = ElementwiseKernel( - "float a, float *x, float b, float *y, float *z", - "z[i] = a*x[i] + b*y[i]", - "linear_combination") + "float a, float *x, float b, float *y, float *z", + "z[i] = a*x[i] + b*y[i]", + "linear_combination", + ) c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) - assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 + assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5 @mark_cuda_test def test_ranged_elwise_kernel(self): from pycuda.elementwise import ElementwiseKernel - set_to_seven = ElementwiseKernel( - "float *z", - "z[i] = 7", - "set_to_seven") - for i, slc in enumerate([ + set_to_seven = ElementwiseKernel("float *z", "z[i] = 7", "set_to_seven") + + for i, slc in enumerate( + [ slice(5, 20000), slice(5, 20000, 17), slice(3000, 5, -1), slice(1000, -1), - ]): + ] + ): a_gpu = gpuarray.zeros((50000,), dtype=np.float32) a_cpu = 
np.zeros(a_gpu.shape, a_gpu.dtype) @@ -480,11 +478,12 @@ class TestGPUArray: b = a_cpu.get() for i in range(0, 10): - assert a[len(a)-1-i] == b[i] + assert a[len(a) - 1 - i] == b[i] @mark_cuda_test def test_sum(self): from pycuda.curandom import rand as curand + a_gpu = curand((200000,)) a = a_gpu.get() @@ -492,7 +491,7 @@ class TestGPUArray: sum_a_gpu = gpuarray.sum(a_gpu).get() - assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4 + assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 @mark_cuda_test def test_minmax(self): @@ -550,37 +549,55 @@ class TestGPUArray: @mark_cuda_test def test_dot(self): from pycuda.curandom import rand as curand - for l in [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, - 255, 256, 257, 16384 - 993, - 20000]: - a_gpu = curand((l,)) + + for sz in [ + 2, + 3, + 4, + 5, + 6, + 7, + 31, + 32, + 33, + 127, + 128, + 129, + 255, + 256, + 257, + 16384 - 993, + 20000, + ]: + a_gpu = curand((sz,)) a = a_gpu.get() - b_gpu = curand((l,)) + b_gpu = curand((sz,)) b = b_gpu.get() dot_ab = np.dot(a, b) dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get() - assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4 + assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 @mark_cuda_test def test_slice(self): from pycuda.curandom import rand as curand - l = 20000 - a_gpu = curand((l,)) + sz = 20000 + a_gpu = curand((sz,)) a = a_gpu.get() from random import randrange + for i in range(200): - start = randrange(l) - end = randrange(start, l) + start = randrange(sz) + end = randrange(start, sz) a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_2d_slice_c(self): @@ -592,6 +609,7 @@ class TestGPUArray: a = a_gpu.get() from random import randrange + for i in range(200): start = randrange(n) end = randrange(start, n) @@ -599,7 +617,7 @@ class TestGPUArray: a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + 
assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_2d_slice_f(self): @@ -609,12 +627,13 @@ class TestGPUArray: n = 1000 m = 300 a_gpu = curand((n, m)) - a_gpu_f = gpuarray.GPUArray((m, n), np.float32, - gpudata=a_gpu.gpudata, - order="F") + a_gpu_f = gpuarray.GPUArray( + (m, n), np.float32, gpudata=a_gpu.gpudata, order="F" + ) a = a_gpu_f.get() from random import randrange + for i in range(200): start = randrange(n) end = randrange(start, n) @@ -622,15 +641,15 @@ class TestGPUArray: a_gpu_slice = a_gpu_f[:, start:end] a_slice = a[:, start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_if_positive(self): from pycuda.curandom import rand as curand - l = 20 - a_gpu = curand((l,)) - b_gpu = curand((l,)) + sz = 20 + a_gpu = curand((sz,)) + b_gpu = curand((sz,)) a = a_gpu.get() b = b_gpu.get() @@ -639,8 +658,8 @@ class TestGPUArray: max_a_b_gpu = gpuarray.maximum(a_gpu, b_gpu) min_a_b_gpu = gpuarray.minimum(a_gpu, b_gpu) - print (max_a_b_gpu) - print((np.maximum(a, b))) + print(max_a_b_gpu) + print(np.maximum(a, b)) assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 @@ -649,18 +668,21 @@ class TestGPUArray: def test_take_put(self): for n in [5, 17, 333]: one_field_size = 8 - buf_gpu = gpuarray.zeros(n*one_field_size, dtype=np.float32) - dest_indices = gpuarray.to_gpu(np.array( - [0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) + buf_gpu = gpuarray.zeros(n * one_field_size, dtype=np.float32) + dest_indices = gpuarray.to_gpu( + np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32) + ) read_map = gpuarray.to_gpu( - np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) + np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32) + ) gpuarray.multi_take_put( - arrays=[buf_gpu for i in range(n)], - dest_indices=dest_indices, - src_indices=read_map, - src_offsets=[i*one_field_size for i in range(n)], - 
dest_shape=(96,)) + arrays=[buf_gpu for i in range(n)], + dest_indices=dest_indices, + src_indices=read_map, + src_offsets=[i * one_field_size for i in range(n)], + dest_shape=(96,), + ) drv.Context.synchronize() @@ -685,7 +707,7 @@ class TestGPUArray: a2 = a_gpu.astype(np.float32).get() assert a2.dtype == np.float32 - assert la.norm(a - a2)/la.norm(a) < 1e-7 + assert la.norm(a - a2) / la.norm(a) < 1e-7 @mark_cuda_test def test_complex_bits(self): @@ -700,10 +722,12 @@ class TestGPUArray: for tp in dtypes: dtype = np.dtype(tp) from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) - z = (curand((n,), real_dtype).astype(dtype) - + 1j*curand((n,), real_dtype).astype(dtype)) + z = curand((n,), real_dtype).astype(dtype) + 1j * curand( + (n,), real_dtype + ).astype(dtype) assert la.norm(z.get().real - z.real.get()) == 0 assert la.norm(z.get().imag - z.imag.get()) == 0 @@ -712,57 +736,61 @@ class TestGPUArray: # verify contiguity is preserved for order in ["C", "F"]: # test both zero and non-zero value code paths - z_real = gpuarray.zeros(z.shape, dtype=real_dtype, - order=order) + z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order) z2 = z.reshape(z.shape, order=order) for zdata in [z_real, z2]: if order == "C": - assert zdata.flags.c_contiguous == True - assert zdata.real.flags.c_contiguous == True - assert zdata.imag.flags.c_contiguous == True - assert zdata.conj().flags.c_contiguous == True + assert zdata.flags.c_contiguous + assert zdata.real.flags.c_contiguous + assert zdata.imag.flags.c_contiguous + assert zdata.conj().flags.c_contiguous elif order == "F": - assert zdata.flags.f_contiguous == True - assert zdata.real.flags.f_contiguous == True - assert zdata.imag.flags.f_contiguous == True - assert zdata.conj().flags.f_contiguous == True - + assert zdata.flags.f_contiguous + assert zdata.real.flags.f_contiguous + assert zdata.imag.flags.f_contiguous + assert zdata.conj().flags.f_contiguous @mark_cuda_test def 
test_pass_slice_to_kernel(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void twice(float *a) { const int i = threadIdx.x + blockIdx.x * blockDim.x; a[i] *= 2; } - """) + """ + ) multiply_them = mod.get_function("twice") - a = np.ones(256**2, np.float32) + a = np.ones(256 ** 2, np.float32) a_gpu = gpuarray.to_gpu(a) multiply_them(a_gpu[256:-256], block=(256, 1, 1), grid=(254, 1)) a = a_gpu.get() assert (a[255:257] == np.array([1, 2], np.float32)).all() - assert (a[255*256-1:255*256+1] == np.array([2, 1], np.float32)).all() + assert (a[255 * 256 - 1: 255 * 256 + 1] == np.array([2, 1], np.float32)).all() @mark_cuda_test def test_scan(self): from pycuda.scan import ExclusiveScanKernel, InclusiveScanKernel + for cls in [ExclusiveScanKernel, InclusiveScanKernel]: scan_kern = cls(np.int32, "a+b", "0") for n in [ - 10, 2**10-5, 2**10, - 2**20-2**18, - 2**20-2**18+5, - 2**10+5, - 2**20+5, - 2**20, 2**24 - ]: + 10, + 2 ** 10 - 5, + 2 ** 10, + 2 ** 20 - 2 ** 18, + 2 ** 20 - 2 ** 18 + 5, + 2 ** 10 + 5, + 2 ** 20 + 5, + 2 ** 20, + 2 ** 24, + ]: host_data = np.random.randint(0, 10, n).astype(np.int32) gpu_data = gpuarray.to_gpu(host_data) @@ -807,7 +835,7 @@ class TestGPUArray: # using -1 as unknown dimension assert a_gpu.reshape(-1, 32).shape == (4, 32) assert a_gpu.reshape((32, -1)).shape == (32, 4) - assert a_gpu.reshape(((8, -1, 4))).shape == (8, 4, 4) + assert a_gpu.reshape((8, -1, 4)).shape == (8, 4, 4) throws_exception = False try: @@ -817,11 +845,11 @@ class TestGPUArray: assert throws_exception # with order specified - a_gpu = a_gpu.reshape((4, 32), order='C') + a_gpu = a_gpu.reshape((4, 32), order="C") assert a_gpu.flags.c_contiguous - a_gpu = a_gpu.reshape(4, 32, order='F') + a_gpu = a_gpu.reshape(4, 32, order="F") assert a_gpu.flags.f_contiguous - a_gpu = a_gpu.reshape((4, 32), order='F') + a_gpu = a_gpu.reshape((4, 32), order="F") assert a_gpu.flags.f_contiguous # default is C-contiguous a_gpu = a_gpu.reshape((4, 32)) @@ -851,26 +879,26 @@ 
class TestGPUArray: a_gpu = gpuarray.to_gpu(a_cpu) # Slice with length 1 on dimensions 0 and 1 - a_gpu_slice = a_gpu[0:1,1:2,:,:] - assert a_gpu_slice.shape == (1,1,shape[2],shape[3]) + a_gpu_slice = a_gpu[0:1, 1:2, :, :] + assert a_gpu_slice.shape == (1, 1, shape[2], shape[3]) assert a_gpu_slice.flags.c_contiguous # Squeeze it and obtain contiguity - a_gpu_squeezed_slice = a_gpu[0:1,1:2,:,:].squeeze() - assert a_gpu_squeezed_slice.shape == (shape[2],shape[3]) + a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze() + assert a_gpu_squeezed_slice.shape == (shape[2], shape[3]) assert a_gpu_squeezed_slice.flags.c_contiguous # Check that we get the original values out assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel()) # Slice with length 1 on dimensions 2 - a_gpu_slice = a_gpu[:,:,2:3,:] - assert a_gpu_slice.shape == (shape[0],shape[1],1,shape[3]) + a_gpu_slice = a_gpu[:, :, 2:3, :] + assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3]) assert not a_gpu_slice.flags.c_contiguous # Squeeze it, but no contiguity here - a_gpu_squeezed_slice = a_gpu[:,:,2:3,:].squeeze() - assert a_gpu_squeezed_slice.shape == (shape[0],shape[1],shape[3]) + a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze() + assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3]) assert not a_gpu_squeezed_slice.flags.c_contiguous # Check that we get the original values out @@ -921,22 +949,29 @@ class TestGPUArray: mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)]) from pycuda.curandom import rand as curand + a_gpu = curand((20000,), dtype=np.float32) a = a_gpu.get() from pycuda.tools import register_dtype + register_dtype(mmc_dtype, "minmax_collector") from pycuda.reduction import ReductionKernel - red = ReductionKernel(mmc_dtype, - neutral="minmax_collector(10000, -10000)", - # FIXME: needs infinity literal in real use, ok here - reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])", - arguments="float *x", 
preamble=preamble) + + red = ReductionKernel( + mmc_dtype, + neutral="minmax_collector(10000, -10000)", + # FIXME: needs infinity literal in real use, ok here + reduce_expr="agg_mmc(a, b)", + map_expr="minmax_collector(x[i], x[i])", + arguments="float *x", + preamble=preamble, + ) minmax = red(a_gpu).get() - #print minmax["cur_min"], minmax["cur_max"] - #print np.min(a), np.max(a) + # print minmax["cur_min"], minmax["cur_max"] + # print np.min(a), np.max(a) assert minmax["cur_min"] == np.min(a) assert minmax["cur_max"] == np.max(a) @@ -944,13 +979,15 @@ class TestGPUArray: @mark_cuda_test def test_reduce_out(self): from pycuda.curandom import rand as curand + a_gpu = curand((10, 200), dtype=np.float32) a = a_gpu.get() from pycuda.reduction import ReductionKernel - red = ReductionKernel(np.float32, neutral=0, - reduce_expr="max(a,b)", - arguments="float *in") + + red = ReductionKernel( + np.float32, neutral=0, reduce_expr="max(a,b)", arguments="float *in" + ) max_gpu = gpuarray.empty(10, dtype=np.float32) for i in range(10): red(a_gpu[i], out=max_gpu[i]) @@ -961,22 +998,24 @@ class TestGPUArray: def test_sum_allocator(self): # FIXME from pytest import skip + skip("https://github.com/inducer/pycuda/issues/163") # crashes with terminate called after throwing an instance of 'pycuda::error' # what(): explicit_context_dependent failed: invalid device context - no currently active context? 
import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() - rng = np.random.randint(low=512,high=1024) + rng = np.random.randint(low=512, high=1024) - a = gpuarray.arange(rng,dtype=np.int32) + a = gpuarray.arange(rng, dtype=np.int32) b = gpuarray.sum(a) c = gpuarray.sum(a, allocator=pool.allocate) # Test that we get the correct results - assert b.get() == rng*(rng-1)//2 - assert c.get() == rng*(rng-1)//2 + assert b.get() == rng * (rng - 1) // 2 + assert c.get() == rng * (rng - 1) // 2 # Test that result arrays were allocated with the appropriate allocator assert b.allocator == a.allocator @@ -986,13 +1025,15 @@ class TestGPUArray: def test_dot_allocator(self): # FIXME from pytest import skip + skip("https://github.com/inducer/pycuda/issues/163") import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() - a_cpu = np.random.randint(low=512,high=1024,size=1024) - b_cpu = np.random.randint(low=512,high=1024,size=1024) + a_cpu = np.random.randint(low=512, high=1024, size=1024) + b_cpu = np.random.randint(low=512, high=1024, size=1024) # Compute the result on the CPU dot_cpu_1 = np.dot(a_cpu, b_cpu) @@ -1012,7 +1053,6 @@ class TestGPUArray: assert dot_gpu_1.allocator == a_gpu.allocator assert dot_gpu_2.allocator == pool.allocate - @mark_cuda_test def test_view_and_strides(self): from pycuda.curandom import rand as curand @@ -1051,8 +1091,8 @@ class TestGPUArray: def test_minimum_maximum_scalar(self): from pycuda.curandom import rand as curand - l = 20 - a_gpu = curand((l,)) + sz = 20 + a_gpu = curand((sz,)) a = a_gpu.get() import pycuda.gpuarray as gpuarray @@ -1065,25 +1105,23 @@ class TestGPUArray: @mark_cuda_test def test_transpose(self): - import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand - a_gpu = curand((10,20,30)) + a_gpu = curand((10, 20, 30)) a = a_gpu.get() - #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) # not contiguous + # assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) # 
not contiguous assert np.allclose(a_gpu.T.get(), a.T) @mark_cuda_test def test_newaxis(self): - import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand - a_gpu = curand((10,20,30)) + a_gpu = curand((10, 20, 30)) a = a_gpu.get() - b_gpu = a_gpu[:,np.newaxis] - b = a[:,np.newaxis] + b_gpu = a_gpu[:, np.newaxis] + b = a[:, np.newaxis] assert b_gpu.shape == b.shape assert b_gpu.strides == b.strides @@ -1091,43 +1129,55 @@ class TestGPUArray: @mark_cuda_test def test_copy(self): from pycuda.curandom import rand as curand - a_gpu = curand((3,3)) - - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]) - - a_gpu = curand((3,1)) - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]) - a_gpu = curand((3,3,3)) - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step]) - - a_gpu = curand((3,3,3)).transpose((1,2,0)) - a = a_gpu.get() - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + a_gpu = curand((3, 3)) + + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step] + ) + + a_gpu = curand((3, 1)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step] + ) + + a_gpu = curand((3, 3, 3)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, start:stop:step].get(), + a_gpu.get()[start:stop:step, start:stop:step], + ) + + a_gpu = curand((3, 3, 3)).transpose((1, 2, 0)) + for 
start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, :, start:stop:step].get(), + a_gpu.get()[start:stop:step, :, start:stop:step], + ) # 4-d should work as long as only 2 axes are discontiguous - a_gpu = curand((3,3,3,3)) - a = a_gpu.get() - for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + a_gpu = curand((3, 3, 3, 3)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, :, start:stop:step].get(), + a_gpu.get()[start:stop:step, :, start:stop:step], + ) @mark_cuda_test def test_get_set(self): import pycuda.gpuarray as gpuarray - a = np.random.normal(0., 1., (4,4)) + a = np.random.normal(0.0, 1.0, (4, 4)) a_gpu = gpuarray.to_gpu(a) assert np.allclose(a_gpu.get(), a) - assert np.allclose(a_gpu[1:3,1:3].get(), a[1:3,1:3]) + assert np.allclose(a_gpu[1:3, 1:3].get(), a[1:3, 1:3]) - a = np.random.normal(0., 1., (4,4,4)).transpose((1,2,0)) + a = np.random.normal(0.0, 1.0, (4, 4, 4)).transpose((1, 2, 0)) a_gpu = gpuarray.to_gpu(a) assert np.allclose(a_gpu.get(), a) - assert np.allclose(a_gpu[1:3,1:3,1:3].get(), a[1:3,1:3,1:3]) + assert np.allclose(a_gpu[1:3, 1:3, 1:3].get(), a[1:3, 1:3, 1:3]) @mark_cuda_test def test_zeros_like_etc(self): @@ -1135,11 +1185,9 @@ class TestGPUArray: a = np.random.randn(*shape).astype(np.float32) z = gpuarray.to_gpu(a) zf = gpuarray.to_gpu(np.asfortranarray(a)) - a_noncontig = np.arange(3*4*5).reshape(3, 4, 5).swapaxes(1, 2) + a_noncontig = np.arange(3 * 4 * 5).reshape(3, 4, 5).swapaxes(1, 2) z_noncontig = gpuarray.to_gpu(a_noncontig) - for func in [gpuarray.empty_like, - gpuarray.zeros_like, - gpuarray.ones_like]: + for func in [gpuarray.empty_like, gpuarray.zeros_like, gpuarray.ones_like]: for arr in [z, zf, z_noncontig]: contig = arr.flags.c_contiguous or arr.flags.f_contiguous @@ -1184,7 +1232,8 @@ if 
__name__ == "__main__": import pycuda.autoinit # noqa if len(sys.argv) > 1: - exec (sys.argv[1]) + exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py index 7d609e249aec722b910a9ab8dd06cafd58ee5798..dc2c28211559bf00e316d6a1f6536512e73a17e3 100644 --- a/test/undistributed/elwise-perf.py +++ b/test/undistributed/elwise-perf.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from __future__ import print_function + #! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit @@ -8,10 +9,9 @@ import numpy.linalg as la from six.moves import range - - def main(): from pytools import Table + tbl = Table() tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]")) @@ -19,7 +19,7 @@ def main(): # they're floats, i.e. 4 bytes each for power in range(10, 28): - size = 1<