diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c38d0153e021384d840060b03cb680bcdb56f39c..32f795f9c6b3b6560c42f0b492e9364975a78b6d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,17 +8,16 @@ on: - cron: '17 3 * * 0' jobs: - flake8: - name: Flake8 + ruff: + name: Ruff runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - # matches compat target in setup.py - python-version: '3.8' + python-version: '3.x' - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py + pipx install ruff + ruff check diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f4d93b8c1193a4ed134af6c20afaf393d3fa64a..cc92cb031e970e6256ed1f7c7e33f67adbc7bd27 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,17 +1,3 @@ -Python 3 Titan X: - script: - - py_version=3 - - EXTRA_INSTALL="numpy mako" - - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - - ". ./build-and-test-py-project.sh" - tags: - - python3 - - nvidia-titan-x - except: - - tags - Python 3 Titan V: script: - py_version=3 @@ -26,21 +12,6 @@ Python 3 Titan V: except: - tags -Python 3 K40: - script: - - py_version=3 - - EXTRA_INSTALL="numpy mako" - - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - - ". ./build-and-test-py-project.sh" - - tags: - - python3 - - nvidia-k40 - except: - - tags - Documentation: script: | EXTRA_INSTALL="numpy mako" @@ -50,15 +21,15 @@ Documentation: build_docs --no-check maybe_upload_docs tags: - - python3 # needs CUDA headers to compile - - nvidia-titan-x + - nvidia-titan-v + - python3 -Flake8: +Ruff: script: - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - - . 
./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py + - pipx install ruff + - ruff check tags: - - python3 + - docker-runner except: - tags diff --git a/aksetup_helper.py b/aksetup_helper.py index 57e82ebd96c75c3dab86d96431f1a3b6b3a10659..32f80cdec881142ee4bca9a7a46d42677a745d70 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -1,9 +1,14 @@ +from __future__ import annotations + import os import sys + + try: from setuptools import Extension from setuptools.command.build_ext import ( # noqa: N812 - build_ext as BaseBuildExtCommand) + build_ext as BaseBuildExtCommand, + ) except ImportError: class Extension: @@ -43,8 +48,8 @@ def setup(*args, **kwargs): def get_numpy_incpath(): - from os.path import join, dirname, exists from importlib.util import find_spec + from os.path import dirname, exists, join origin = find_spec("numpy").origin if origin is None: raise RuntimeError("origin of numpy package not found") @@ -294,7 +299,7 @@ class ConfigSchema: def read_config_from_pyfile(self, filename): result = {} filevars = {} - infile = open(filename, "r") + infile = open(filename) try: contents = infile.read() finally: @@ -313,12 +318,11 @@ class ConfigSchema: filevars = {} try: - exec(compile(open(filename, "r").read(), filename, "exec"), filevars) - except IOError: + exec(compile(open(filename).read(), filename, "exec"), filevars) + except OSError: pass - if "__builtins__" in filevars: - del filevars["__builtins__"] + filevars.pop("__builtins__", None) for key, value in config.items(): if value is not None: @@ -405,7 +409,7 @@ class ConfigSchema: result = self.get_default_config_with_files() if os.access(cfile, os.R_OK): - with open(cfile, "r") as inf: + with open(cfile) as inf: py_snippet = inf.read() self.update_from_python_snippet(result, py_snippet, cfile) @@ -438,7 +442,7 @@ class ConfigSchema: for opt in self.options) -class Option(object): +class Option: def __init__(self, name, default=None, help=None): self.name = name self.default = default @@ -734,7 +738,7 @@ def substitute(substitutions, fname): string_var_re = re.compile(r"\$str\{([A-Za-z_0-9]+)\}") fname_in = fname+".in" - with open(fname_in, "r") as inf: + with open(fname_in) as inf: lines = inf.readlines() new_lines = [] @@ -769,7 +773,7 @@ def substitute(substitutions, fname): with open(fname, "w") as outf: outf.write("".join(new_lines)) - from os import stat, chmod + from os import chmod, stat infile_stat_res = stat(fname_in) chmod(fname, infile_stat_res.st_mode) @@ -778,7 +782,7 @@ def substitute(substitutions, fname): def _run_git_command(cmd): git_error = None - from subprocess import Popen, PIPE + from subprocess import PIPE, Popen stdout = None try: popen = Popen(["git"] + cmd, stdout=PIPE) @@ -909,7 +913,7 @@ def check_pybind11(): # {{{ (modified) boilerplate from https://github.com/pybind/python_example/blob/2ed5a68759cd6ff5d2e5992a91f08616ef457b5c/setup.py # noqa -class get_pybind_include(object): # noqa: N801 +class get_pybind_include: # noqa: N801 """Helper class to determine the pybind11 include path The purpose of this class is to postpone importing pybind11 diff --git a/configure.py b/configure.py index 85c9841d9f995805d8b64bb8aae976b6ecc43ba3..c3400879412b730b92243669856f53fc8c76a51b 100755 --- a/configure.py +++ b/configure.py @@ -1,4 +1,7 @@ #! 
/usr/bin/env python3 +from __future__ import annotations from aksetup_helper import configure_frontend + + configure_frontend() diff --git a/doc/conf.py b/doc/conf.py index 5796b12c456d0a1a7dd7d907374b0e2224cdd298..8dfb883be448c29a952204253ee67bdaa09a2fba 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,8 @@ +from __future__ import annotations + from urllib.request import urlopen + _conf_url = \ "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py" with urlopen(_conf_url) as _inf: @@ -8,10 +11,10 @@ with urlopen(_conf_url) as _inf: copyright = "2008-21, Andreas Kloeckner" ver_dic = {} +with open("../pycuda/__init__.py") as initf: + init_contents = initf.read() exec( - compile( - open("../pycuda/__init__.py").read(), "../pycuda/__init__.py", "exec" - ), + compile(init_contents, "../pycuda/__init__.py", "exec"), ver_dic, ) version = ".".join(str(x) for x in ver_dic["VERSION"]) diff --git a/examples/cai_cupy_arrays.py b/examples/cai_cupy_arrays.py index 2c524ba945a5bd61c593c1081b46a2ba1ddf080f..fbc276f2527083a20436d377c4cde7efa18f2e50 100644 --- a/examples/cai_cupy_arrays.py +++ b/examples/cai_cupy_arrays.py @@ -1,11 +1,12 @@ # Copyright 2008-2021 Andreas Kloeckner # Copyright 2021 NVIDIA Corporation +from __future__ import annotations + +import cupy as cp import pycuda.autoinit # noqa from pycuda.compiler import SourceModule -import cupy as cp - # Create a CuPy array (and a copy for comparison later) cupy_a = cp.random.randn(4, 4).astype(cp.float32) diff --git a/examples/cai_numba.py b/examples/cai_numba.py index 0a94ee48f495e2a152d219ab7aaf02b8d7c87d1c..f42df68c637590f361908485c9dbe40042235d6e 100644 --- a/examples/cai_numba.py +++ b/examples/cai_numba.py @@ -1,16 +1,14 @@ # Copyright 2008-2021 Andreas Kloeckner # Copyright 2021 NVIDIA Corporation +from __future__ import annotations +import numpy from numba import cuda -import pycuda.driver as pycuda # We use autoprimaryctx instead of autoinit because Numba can only operate on a # primary context -import pycuda.autoprimaryctx # noqa import pycuda.gpuarray as gpuarray -import numpy - # Create a PyCUDA gpuarray a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) diff --git a/examples/demo.py b/examples/demo.py index a1c404209e929a2c3126efeaf21d9efc48ca7bdb..3d49c92f5a4b68958db9667778059ac96238a447 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,9 +1,13 @@ # Sample source code from the Tutorial Introduction in the documentation. -import pycuda.driver as cuda +from __future__ import annotations + +import numpy + import pycuda.autoinit # noqa +import pycuda.driver as cuda from pycuda.compiler import SourceModule -import numpy + a = numpy.random.randn(4, 4) a = a.astype(numpy.float32) @@ -39,6 +43,8 @@ print(a) # part 2 ---------------------------------------------------------------------- import pycuda.gpuarray as gpuarray + + a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index 1814369f1a11056585487ad2be1e5029895256a0..f0908e243db7931282857a4a4479a68a4ff4deef 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -16,13 +16,16 @@ * is strictly prohibited. 
* --------------------------------------------------------------------------- ''' +from __future__ import annotations + +import sys -import sys, os import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import DynamicSourceModule -cdpSimplePrint_cu = ''' + +cdpSimplePrint_cu = """ #include //////////////////////////////////////////////////////////////////////////////// @@ -66,7 +69,7 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid { // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. __shared__ int s_uid; - if( threadIdx.x == 0 ) + if( threadIdx.x == 0 ) { s_uid = atomicAdd( &g_uids, 1 ); } @@ -74,7 +77,7 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid // We print the ID of the block and information about its parent. print_info( depth, thread, s_uid, parent_uid ); - + // We launch new blocks if we haven't reached the max_depth yet. if( ++depth >= max_depth ) { @@ -82,7 +85,8 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid } cdp_kernel<<>>( max_depth, depth, threadIdx.x, s_uid ); } -''' +""" + def main(argv): max_depth = 2 @@ -96,7 +100,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") mod = DynamicSourceModule(cdpSimplePrint_cu) - cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + cdp_kernel = mod.get_function("cdp_kernel").prepare("iiii").prepared_call print("***************************************************************************") print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") @@ -104,7 +108,7 @@ def main(argv): print("until it reaches max_depth=%d\n" % max_depth) print("In total 2") num_blocks, sum = 2, 2 - for i in range(1, max_depth): + for _i in range(1, max_depth): num_blocks *= 4 print("+%d" % num_blocks) sum += num_blocks @@ -115,7 +119,8 @@ def main(argv): pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") - cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + cdp_kernel((2, 1), (2, 1, 1), max_depth, 0, 0, -1) + if __name__ == "__main__": main(sys.argv) diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index 9baadc13a307d04d1c27b4c18acff5b50d0dbe65..8c361332baecd470dd1b42e13d31d8d1d1f5aa00 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -1,19 +1,24 @@ -import pycuda.gpuarray as gpuarray -import pycuda.autoinit +from __future__ import annotations + import numpy + +import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand + a_gpu = curand((50,)) b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel + + lin_comb = ElementwiseKernel( "float a, float *x, float b, float *y, float *z", "z[i] = my_f(a*x[i], b*y[i])", "linear_combination", preamble=""" __device__ float my_f(float x, float y) - { + { return sin(x*y); } """) @@ -22,4 +27,6 @@ c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la + + assert la.norm(c_gpu.get() - numpy.sin((5*a_gpu*6*b_gpu).get())) < 1e-5 diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py index ddd89f5afafc891b75649854bb5bc2f72b5e00d4..e3ac5104da4d675187d12e2689d20b1dd151694d 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -1,9 +1,12 @@ -import pycuda.driver as cuda -import pycuda.autoinit +from __future__ import 
annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 256 macroblock_count = 33 @@ -18,38 +21,46 @@ a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) c_gpu = cuda.mem_alloc(a.nbytes) -from cgen import FunctionBody, \ - FunctionDeclaration, POD, Value, \ - Pointer, Module, Block, Initializer, Assign +from cgen import ( + POD, + Assign, + Block, + FunctionBody, + FunctionDeclaration, + Initializer, + Module, + Pointer, + Value, +) from cgen.cuda import CudaGlobal + mod = Module([ FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "add"), - arg_decls=[Pointer(POD(dtype, name)) + arg_decls=[Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"]])), Block([ Initializer( POD(numpy.int32, "idx"), - "threadIdx.x + %d*blockIdx.x" + "threadIdx.x + %d*blockIdx.x" % (block_size*thread_strides)), ]+[ Assign( "tgt[idx+%d]" % (o*block_size), "op1[idx+%d] + op2[idx+%d]" % ( - o*block_size, + o*block_size, o*block_size)) for o in range(thread_strides)]))]) mod = SourceModule(mod) func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) assert la.norm(c-(a+b)) == 0 - diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py index 4093434d81c1e66b31d99ec62944d0352c64b09d..d8ffeb79b9bb9ff98dd766b4a8977cef3e3b67be 100644 --- a/examples/demo_meta_template.py +++ b/examples/demo_meta_template.py @@ -1,9 +1,12 @@ -import pycuda.driver as cuda -import pycuda.autoinit +from __future__ import annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 32 macroblock_count = 33 @@ -20,20 +23,21 @@ c_gpu = cuda.mem_alloc(a.nbytes) from jinja2 import Template + tpl = Template(""" __global__ void add( - {{ type_name }} *tgt, - {{ type_name }} *op1, + {{ type_name }} *tgt, + {{ type_name }} *op1, {{ type_name }} *op2) { - int idx = threadIdx.x + + int idx = threadIdx.x + {{ block_size }} * {{thread_strides}} * blockIdx.x; {% for i in range(thread_strides) %} {% set offset = i*block_size %} - tgt[idx + {{ offset }}] = - op1[idx + {{ offset }}] + tgt[idx + {{ offset }}] = + op1[idx + {{ offset }}] + op2[idx + {{ offset }}]; {% endfor %} }""") @@ -46,9 +50,9 @@ mod = SourceModule(rendered_tpl) # end func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) diff --git a/examples/demo_struct.py b/examples/demo_struct.py index 37f890604f903188ecd0b3142d88cdd95f1c0077..a273e0e452e430d9ad47edd746b1beb44b267521 100644 --- a/examples/demo_struct.py +++ b/examples/demo_struct.py @@ -1,12 +1,17 @@ # prepared invocations and structures ----------------------------------------- -import pycuda.driver as cuda -import pycuda.autoinit -import numpy +from __future__ import annotations + import struct + +import numpy + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + class DoubleOpStruct: mem_size = 8 + numpy.uintp(0).nbytes + def __init__(self, array, struct_arr_ptr): self.data = cuda.to_device(array) self.shape, self.dtype = array.shape, array.dtype @@ -17,6 +22,7 @@ class DoubleOpStruct: def __str__(self): return 
str(cuda.from_device(self.data, self.shape, self.dtype)) + struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size) do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size diff --git a/examples/dump_properties.py b/examples/dump_properties.py index a960619cf706ca1a7e912636238bfedc90d57384..70c7131931c2d47a2e71da438d4847cc41aac4f9 100644 --- a/examples/dump_properties.py +++ b/examples/dump_properties.py @@ -1,5 +1,6 @@ -import pycuda.driver as drv +from __future__ import annotations +import pycuda.driver as drv drv.init() @@ -10,10 +11,9 @@ for ordinal in range(drv.Device.count()): print("Device #%d: %s" % (ordinal, dev.name())) print(" Compute Capability: %d.%d" % dev.compute_capability()) print(" Total Memory: %s KB" % (dev.total_memory()//(1024))) - atts = [(str(att), value) + atts = [(str(att), value) for att, value in list(dev.get_attributes().items())] atts.sort() - + for att, value in atts: print(f" {att}: {value}") - diff --git a/examples/fill_gpu_with_nans.py b/examples/fill_gpu_with_nans.py index ffa65c449e7ae1adb5749591a88a6707213785db..928b513390cbf1fe608d679b2a75a9a794bc9940 100644 --- a/examples/fill_gpu_with_nans.py +++ b/examples/fill_gpu_with_nans.py @@ -1,8 +1,11 @@ -import pycuda.autoinit -import pycuda.gpuarray as gpuarray -import pycuda.driver as cuda +from __future__ import annotations + import numpy +import pycuda.driver as cuda +import pycuda.gpuarray as gpuarray + + free_bytes, total_bytes = cuda.mem_get_info() exp = 10 while True: @@ -22,4 +25,3 @@ while True: ary.fill(float("nan")) print("filled %d out of %d bytes with NaNs" % (fill_floats*4, free_bytes)) - diff --git a/examples/from-wiki/2d_fft.py b/examples/from-wiki/2d_fft.py index 952d96eb24e519540ac5c2dfea0a4726019b5e28..07dd2875b5a5de04d50bc7c9c5377f9d93eea007 100644 --- a/examples/from-wiki/2d_fft.py +++ b/examples/from-wiki/2d_fft.py @@ -1,14 +1,14 @@ -#!python -import numpy -import scipy.misc -import numpy.fft as nfft +#!python +from __future__ import annotations + import multiprocessing +import numpy +import scipy.misc from pyfft.cuda import Plan -from pycuda.tools import make_default_context -import pycuda.tools as pytools -import pycuda.gpuarray as garray + import pycuda.driver as drv +import pycuda.gpuarray as garray class GPUMulti(multiprocessing.Process): @@ -20,15 +20,15 @@ class GPUMulti(multiprocessing.Process): def run(self): drv.init() - a0=numpy.zeros((p,),dtype=numpy.complex64) + a0 = numpy.zeros((p,), dtype=numpy.complex64) self.dev = drv.Device(self.number) self.ctx = self.dev.make_context() -#TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING) +# TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING) print(drv.mem_get_info()) self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64) self.gpu_b = garray.zeros_like(self.gpu_a) self.gpu_a = garray.to_gpu(self.input_cpu) - plan = Plan(a0.shape,context=self.ctx) + plan = Plan(a0.shape, context=self.ctx) plan.execute(self.gpu_a, self.gpu_b, batch=p/m) self.temp = self.gpu_b.get() self.output_cpu.put(self.temp) @@ -38,56 +38,55 @@ class GPUMulti(multiprocessing.Process): del self.gpu_b del self.ctx - print("till the end %d" %self.number) + print("till the end %d" % self.number) -p = 8192; # INPUT IMAGE SIZE (8192 * 8192) +p = 8192 # INPUT IMAGE SIZE (8192 * 8192) m = 4 # TO DIVIDE THE INPUT IMAGE INTO 4* (2048 * 8192) SIZED IMAGES (Depends on the total memory of your GPU) -trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 
2 times +trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 2 times -#INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN) -p_n = 8000 # No. OF PERIODS OF SINE WAVES -x=numpy.arange(0,p_n,float(p_n)/float(p)) +# INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN) +p_n = 8000 # No. OF PERIODS OF SINE WAVES +x = numpy.arange(0, p_n, float(p_n)/float(p)) a_i = 128 + 128 * numpy.sin(2*numpy.pi*x) -a2 = numpy.zeros([p,p],dtype=numpy.complex64) -a2[::]=a_i -scipy.misc.imsave("sine.bmp",numpy.absolute(a2)) #TEST THE GENERATION OF INPUT IMAGE +a2 = numpy.zeros([p, p], dtype=numpy.complex64) +a2[::] = a_i +scipy.misc.imsave("sine.bmp", numpy.absolute(a2)) # TEST THE GENERATION OF INPUT IMAGE -#INITIALISE THE VARIABLES -a2_1 = numpy.zeros([m,p*p/m],dtype = numpy.complex64) #INPUT TO THE GPU (1d ARRAY) -#VERY IMPORTANT -output_cpu = multiprocessing.Queue() #STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA) +# INITIALISE THE VARIABLES +a2_1 = numpy.zeros([m, p*p/m], dtype=numpy.complex64) # INPUT TO THE GPU (1d ARRAY) +# VERY IMPORTANT +output_cpu = multiprocessing.Queue() # STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA) -b2pa = numpy.zeros([p/m,p,m],dtype = numpy.complex64) #OUTPUT FROM GPU -b2_a = numpy.zeros([p,p],dtype = numpy.complex64) #RESHAPED (8192*8192) OUTPUT +b2pa = numpy.zeros([p/m, p, m], dtype=numpy.complex64) # OUTPUT FROM GPU +b2_a = numpy.zeros([p, p], dtype=numpy.complex64) # RESHAPED (8192*8192) OUTPUT -#NOW WE ARE READY TO KICK START THE GPU +# NOW WE ARE READY TO KICK START THE GPU # THE NO OF GPU'S PRESENT (CHANGE ACCORDING TO THE No.OF GPUS YOU HAVE) -num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE) +num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE) -#THE TRANSPOSE-SPLIT ALGORITHM FOR FFT -for t in range (0,trans): - for i in range (m): - a2_1[i,:] = a2[i*p/m:(i+1)*p/m,:].flatten()#DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY +# THE TRANSPOSE-SPLIT ALGORITHM FOR FFT +for _t in range(0, trans): + for i in range(m): + a2_1[i, :] = a2[i*p/m:(i+1)*p/m, :].flatten() # DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY - for j in range (m/num): + for j in range(m/num): gpu_multi_list = [] -#CREATE AND START THE MULTIPROCESS - for i in range (num): - gpu_multi = GPUMulti(i,a2_1[i+j*num,:],output_cpu) #FEED THE DATA INTO THE GPU +# CREATE AND START THE MULTIPROCESS + for i in range(num): + gpu_multi = GPUMulti(i, a2_1[i+j*num, :], output_cpu) # FEED THE DATA INTO THE GPU gpu_multi_list.append(gpu_multi) - gpu_multi.start()#THERE YOU GO + gpu_multi.start() # THERE YOU GO -#COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE +# COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE for gpu_pro in gpu_multi_list: temp_b2_1 = output_cpu.get(gpu_pro) - b2pa[:,:,gpu_pro.number+j*num] = numpy.reshape(temp_b2_1,(p/m,p)) + b2pa[:, :, gpu_pro.number+j*num] = numpy.reshape(temp_b2_1, (p/m, p)) gpu_multi.terminate() -#RESHAPE AGAIN TO (8192 * 8192) IMAGE +# RESHAPE AGAIN TO (8192 * 8192) IMAGE for i in range(m): - b2_a[i*p/m:(i+1)*p/m,:] = b2pa[:,:,i] - + b2_a[i*p/m:(i+1)*p/m, :] = b2pa[:, :, i] diff --git a/examples/from-wiki/2dfft.py b/examples/from-wiki/2dfft.py deleted file mode 100644 index 67585107019452911a3233ecaecd49d8ba7492b5..0000000000000000000000000000000000000000 --- a/examples/from-wiki/2dfft.py +++ /dev/null @@ -1,5 +0,0 @@ -#!python -# Paste 
code for your example here. - -Note: please leave the '#!python' marker in place above. The script 'examples/download-examples-from-wiki.py' in the PyCUDA distribution relies on it. - diff --git a/examples/from-wiki/arithmetic_example.py b/examples/from-wiki/arithmetic_example.py index 3bf5e7143df06da11b17c5319f159fe98d7fb6cb..c70023d0b51fd8442946b588e0eec2c27131ac5e 100644 --- a/examples/from-wiki/arithmetic_example.py +++ b/examples/from-wiki/arithmetic_example.py @@ -1,19 +1,22 @@ -#!python #!python +# !python +from __future__ import annotations + +import numpy as np + import pycuda.driver as cuda -import pycuda.autoinit -from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray -import numpy as np +from pycuda.compiler import SourceModule + -# Converting the list into numpy array for faster access and putting it into the GPU for processing... +# Converting the list into numpy array for faster access and putting it into the GPU for processing... start = cuda.Event() end = cuda.Event() N = 222341 values = np.random.randn(N) -number_of_blocks=N/1024 +number_of_blocks = N/1024 # Calculating the (value-max)/max-min computation and storing it in a numpy array. Pre-calculating the maximum and minimum values. @@ -32,7 +35,7 @@ if (idx < N) } """, no_extern_c=1) -func = func_mod.get_function('func') +func = func_mod.get_function("func") x = np.asarray(values, np.float32) x_gpu = gpuarray.to_gpu(x) h_minval = np.float32(0) @@ -40,16 +43,13 @@ h_denom = np.int32(255) start.record() # a function to the GPU to calculate the computation in the GPU. -func(x_gpu.gpudata, np.uint32(N), np.float32(h_minval), np.uint32(h_denom), block=(1024, 1, 1), grid=(number_of_blocks+1,1,1)) -end.record() +func(x_gpu.gpudata, np.uint32(N), np.float32(h_minval), np.uint32(h_denom), block=(1024, 1, 1), grid=(number_of_blocks+1, 1, 1)) +end.record() end.synchronize() secs = start.time_till(end)*1e-3 print("SourceModule time") print("%fs" % (secs)) -print('x: ', x[N-1]) -print('Func(x): ', x_gpu.get()[N-1],'Actual: ',(values[N-1]-0)/(h_denom)) -x_colors=x_gpu.get() - - - +print("x: ", x[N-1]) +print("Func(x): ", x_gpu.get()[N-1], "Actual: ", (values[N-1]-0)/(h_denom)) +x_colors = x_gpu.get() diff --git a/examples/from-wiki/c++_function_templates.py b/examples/from-wiki/c++_function_templates.py index b25d122d057b69eb57e2556a3ac179431fd3f857..61fcfd498274d2752eafc1e9967f721868609567 100644 --- a/examples/from-wiki/c++_function_templates.py +++ b/examples/from-wiki/c++_function_templates.py @@ -1,10 +1,12 @@ -#!python -import pycuda.gpuarray as gpuarray -import pycuda.driver as drv -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy as np +import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule + + func_mod = SourceModule(""" template __device__ T incr(T x) { @@ -23,7 +25,7 @@ extern "C" { } """, no_extern_c=1) -func = func_mod.get_function('func') +func = func_mod.get_function("func") N = 5 x = np.asarray(np.random.rand(N), np.float32) @@ -31,6 +33,5 @@ x_orig = x.copy() x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(N), block=(N, 1, 1)) -print('x: ', x) -print('incr(x): ', x_gpu.get()) - +print("x: ", x) +print("incr(x): ", x_gpu.get()) diff --git a/examples/from-wiki/computing.py b/examples/from-wiki/computing.py deleted file mode 100644 index 67585107019452911a3233ecaecd49d8ba7492b5..0000000000000000000000000000000000000000 --- a/examples/from-wiki/computing.py +++ /dev/null @@ -1,5 +0,0 @@ -#!python -# Paste code for your example here. 
- -Note: please leave the '#!python' marker in place above. The script 'examples/download-examples-from-wiki.py' in the PyCUDA distribution relies on it. - diff --git a/examples/from-wiki/convolution.py b/examples/from-wiki/convolution.py index 09140abd4aee017b657b0765e3036c0de8e1465e..84db43a84d1e4dc73f38497e4066fc7b9f3e032d 100644 --- a/examples/from-wiki/convolution.py +++ b/examples/from-wiki/convolution.py @@ -1,4 +1,4 @@ -#!python +#!python ''' /* * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. @@ -42,12 +42,15 @@ Ported to pycuda by Andrew Wagner , June 2009. ''' +from __future__ import annotations + +import string import numpy -import pycuda.autoinit + import pycuda.driver as cuda from pycuda.compiler import SourceModule -import string + # Pull out a bunch of stuff that was hard coded as pre-processor directives used # by both the kernel and calling code. @@ -58,7 +61,7 @@ ROW_TILE_W = 128 KERNEL_RADIUS_ALIGNED = 16 COLUMN_TILE_W = 16 COLUMN_TILE_H = 48 -template = ''' +template = """ //24-bit multiplication is faster on G80, //but we must be sure to multiply integers //only within [-8M, 8M - 1] range @@ -139,20 +142,20 @@ __global__ void convolutionRowGPU( if(writePos <= tileEndClamped){ const int smemPos = writePos - apronStart; float sum = 0; -''' -originalLoop = ''' +""" +originalLoop = """ for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k]; -''' -unrolledLoop = '' -for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): +""" +unrolledLoop = "" +for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): loopTemplate = string.Template( - 'sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n') + "sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n") unrolledLoop += loopTemplate.substitute(k=k) -#print unrolledLoop +# print unrolledLoop template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop -template += ''' +template += """ d_Result[rowStart + writePos] = sum; //d_Result[rowStart + writePos] = 128; } @@ -212,40 +215,41 @@ __global__ void convolutionColumnGPU( //Calculate and output the results for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){ float sum = 0; -''' -originalLoop = ''' +""" +originalLoop = """ for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - k]; -''' -unrolledLoop = '' -for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): - loopTemplate = string.Template('sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n') +""" +unrolledLoop = "" +for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): + loopTemplate = string.Template("sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n") unrolledLoop += loopTemplate.substitute(k=k) -#print unrolledLoop +# print unrolledLoop template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop -template += ''' +template += """ d_Result[gmemPos] = sum; //d_Result[gmemPos] = 128; smemPos += smemStride; gmemPos += gmemStride; } } -''' +""" template = string.Template(template) -code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS, - KERNEL_W = KERNEL_W, +code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS, + KERNEL_W=KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED) module = SourceModule(code) -convolutionRowGPU = module.get_function('convolutionRowGPU') -convolutionColumnGPU = 
module.get_function('convolutionColumnGPU') -d_Kernel_rows = module.get_global('d_Kernel_rows')[0] -d_Kernel_columns = module.get_global('d_Kernel_columns')[0] +convolutionRowGPU = module.get_function("convolutionRowGPU") +convolutionColumnGPU = module.get_function("convolutionColumnGPU") +d_Kernel_rows = module.get_global("d_Kernel_rows")[0] +d_Kernel_columns = module.get_global("d_Kernel_columns")[0] + # Helper functions for computing alignment... def iDivUp(a, b): @@ -254,11 +258,13 @@ def iDivUp(a, b): b = numpy.int32(b) return (a / b + 1) if (a % b != 0) else (a / b) + def iDivDown(a, b): # Round a / b to nearest lower integer value a = numpy.int32(a) b = numpy.int32(b) - return a / b; + return a / b + def iAlignUp(a, b): # Align a to nearest higher multiple of b @@ -266,31 +272,34 @@ def iAlignUp(a, b): b = numpy.int32(b) return (a - a % b + b) if (a % b != 0) else a + def iAlignDown(a, b): # Align a to nearest lower multiple of b a = numpy.int32(a) b = numpy.int32(b) return a - a % b -def gaussian_kernel(width = KERNEL_W, sigma = 4.0): - assert width == numpy.floor(width), 'argument width should be an integer!' + +def gaussian_kernel(width=KERNEL_W, sigma=4.0): + assert width == numpy.floor(width), "argument width should be an integer!" radius = (width - 1)/2.0 - x = numpy.linspace(-radius, radius, width) + x = numpy.linspace(-radius, radius, width) x = numpy.float32(x) sigma = numpy.float32(sigma) filterx = x*x / (2 * sigma * sigma) filterx = numpy.exp(-1 * filterx) - assert filterx.sum()>0, 'something very wrong if gaussian kernel sums to zero!' + assert filterx.sum() > 0, "something very wrong if gaussian kernel sums to zero!" filterx /= filterx.sum() return filterx -def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4): - assert width == numpy.floor(width), 'argument width should be an integer!' + +def derivative_of_gaussian_kernel(width=KERNEL_W, sigma=4): + assert width == numpy.floor(width), "argument width should be an integer!" radius = (width - 1)/2.0 - x = numpy.linspace(-radius, radius, width) + x = numpy.linspace(-radius, radius, width) x = numpy.float32(x) # The derivative of a gaussian is really just a gaussian times x, up to scale. - filterx = gaussian_kernel(width, sigma) + filterx = gaussian_kernel(width, sigma) filterx *= x # Rescale so that filter returns derivative of 1 when applied to x: scale = (x * filterx).sum() @@ -299,43 +308,45 @@ def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4): filterx *= -1.0 return filterx + def test_derivative_of_gaussian_kernel(): width = 20 sigma = 10.0 - filterx = derivative_of_gaussian_kernel(width, sigma) + filterx = derivative_of_gaussian_kernel(width, sigma) x = 2 * numpy.arange(0, width) x = numpy.float32(x) response = (filter * x).sum() - assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!' + assert abs(response - (-2.0)) < .0001, "derivative of gaussian failed scale test!" width = 19 sigma = 10.0 - filterx = derivative_of_gaussian_kernel(width, sigma) + filterx = derivative_of_gaussian_kernel(width, sigma) x = 2 * numpy.arange(0, width) x = numpy.float32(x) response = (filterx * x).sum() - assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!' + assert abs(response - (-2.0)) < .0001, "derivative of gaussian failed scale test!" + -def convolution_cuda(sourceImage, filterx, filtery): +def convolution_cuda(sourceImage, filterx, filtery): # Perform separable convolution on sourceImage using CUDA. 
# Operates on floating point images with row-major storage. destImage = sourceImage.copy() - assert sourceImage.dtype == 'float32', 'source image must be float32' - (imageHeight, imageWidth) = sourceImage.shape - assert filterx.shape == filtery.shape == (KERNEL_W, ) , 'Kernel is compiled for a different kernel size! Try changing KERNEL_W' + assert sourceImage.dtype == "float32", "source image must be float32" + (imageHeight, imageWidth) = sourceImage.shape + assert filterx.shape == filtery.shape == (KERNEL_W, ), "Kernel is compiled for a different kernel size! Try changing KERNEL_W" filterx = numpy.float32(filterx) filtery = numpy.float32(filtery) - DATA_W = iAlignUp(imageWidth, 16); - DATA_H = imageHeight; - BYTES_PER_WORD = 4; # 4 for float32 - DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD; - KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD; + DATA_W = iAlignUp(imageWidth, 16) + DATA_H = imageHeight + BYTES_PER_WORD = 4 # 4 for float32 + DATA_W * DATA_H * BYTES_PER_WORD + KERNEL_W * BYTES_PER_WORD # Prepare device arrays destImage_gpu = cuda.mem_alloc_like(destImage) sourceImage_gpu = cuda.mem_alloc_like(sourceImage) intermediateImage_gpu = cuda.mem_alloc_like(sourceImage) cuda.memcpy_htod(sourceImage_gpu, sourceImage) - cuda.memcpy_htod(d_Kernel_rows, filterx) # The kernel goes into constant memory via a symbol defined in the kernel - cuda.memcpy_htod(d_Kernel_columns, filtery) + cuda.memcpy_htod(d_Kernel_rows, filterx) # The kernel goes into constant memory via a symbol defined in the kernel + cuda.memcpy_htod(d_Kernel_columns, filtery) # Call the kernels for convolution in each direction. blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H) blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H)) @@ -343,32 +354,33 @@ def convolution_cuda(sourceImage, filterx, filtery): threadBlockColumns = (COLUMN_TILE_W, 8, 1) DATA_H = numpy.int32(DATA_H) DATA_W = numpy.int32(DATA_W) - grid_rows = tuple([int(e) for e in blockGridRows]) - block_rows = tuple([int(e) for e in threadBlockRows]) - grid_cols = tuple([int(e) for e in blockGridColumns]) - block_cols = tuple([int(e) for e in threadBlockColumns]) - convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=grid_rows, block=block_rows) - convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=grid_cols, block=block_cols) + grid_rows = tuple(int(e) for e in blockGridRows) + block_rows = tuple(int(e) for e in threadBlockRows) + grid_cols = tuple(int(e) for e in blockGridColumns) + block_cols = tuple(int(e) for e in threadBlockColumns) + convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=grid_rows, block=block_rows) + convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=grid_cols, block=block_cols) # Pull the data back from the GPU. - cuda.memcpy_dtoh(destImage, destImage_gpu) + cuda.memcpy_dtoh(destImage, destImage_gpu) return destImage + def test_convolution_cuda(): # Test the convolution kernel. # Generate or load a test image - original = numpy.random.rand(768, 1024) * 255 + original = numpy.random.rand(768, 1024) * 255 original = numpy.float32(original) # You probably want to display the image using the tool of your choice here. 
filterx = gaussian_kernel() destImage = original.copy() destImage[:] = numpy.nan - destImage = convolution_cuda(original, filterx, filterx) + destImage = convolution_cuda(original, filterx, filterx) # You probably want to display the result image using the tool of your choice here. - print('Done running the convolution kernel!') + print("Done running the convolution kernel!") -if __name__ == '__main__': - test_convolution_cuda() - #test_derivative_of_gaussian_kernel() - boo = input('Pausing so you can look at results... to finish...') +if __name__ == "__main__": + test_convolution_cuda() + # test_derivative_of_gaussian_kernel() + boo = input("Pausing so you can look at results... to finish...") diff --git a/examples/from-wiki/demo_complex.py b/examples/from-wiki/demo_complex.py index a89a36b9bd0be3c2f705d14353d0b56743ab98a0..a704f4f1aae8392e4d98af4ef0ec421dbb047fda 100644 --- a/examples/from-wiki/demo_complex.py +++ b/examples/from-wiki/demo_complex.py @@ -1,9 +1,10 @@ -#!python -import pycuda.driver as cuda -import pycuda.gpuarray as gpuarray -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy -from pycuda.curandom import rand as curand + +import pycuda.gpuarray as gpuarray + a = (numpy.random.randn(400) +1j*numpy.random.randn(400)).astype(numpy.complex64) @@ -14,6 +15,8 @@ a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) from pycuda.elementwise import ElementwiseKernel + + complex_mul = ElementwiseKernel( "pycuda::complex *x, pycuda::complex *y, pycuda::complex *z", "z[i] = x[i] * y[i]", @@ -24,8 +27,8 @@ c_gpu = gpuarray.empty_like(a_gpu) complex_mul(a_gpu, b_gpu, c_gpu) import numpy.linalg as la + + error = la.norm(c_gpu.get() - (a*b)) print(error) assert error < 1e-5 - - diff --git a/examples/from-wiki/demo_meta_cgen.py b/examples/from-wiki/demo_meta_cgen.py index 51e1d737da1f0d46e935464e735146aa0fb6fef6..2a44f83c6418bb75e4352b12a826964da16535cc 100644 --- a/examples/from-wiki/demo_meta_cgen.py +++ b/examples/from-wiki/demo_meta_cgen.py @@ -1,10 +1,13 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 256 macroblock_count = 33 @@ -19,40 +22,46 @@ a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) c_gpu = cuda.mem_alloc(a.nbytes) -from cgen import FunctionBody, \ - FunctionDeclaration, Typedef, POD, Value, \ - Pointer, Module, Block, Initializer, Assign +from cgen import ( + POD, + Assign, + Block, + FunctionBody, + FunctionDeclaration, + Initializer, + Module, + Pointer, + Value, +) from cgen.cuda import CudaGlobal + mod = Module([ FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "add"), - arg_decls=[Pointer(POD(dtype, name)) + arg_decls=[Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"]])), Block([ Initializer( POD(numpy.int32, "idx"), - "threadIdx.x + %d*blockIdx.x" + "threadIdx.x + %d*blockIdx.x" % (block_size*thread_strides)), ]+[ Assign( "tgt[idx+%d]" % (o*block_size), "op1[idx+%d] + op2[idx+%d]" % ( - o*block_size, + o*block_size, o*block_size)) for o in range(thread_strides)]))]) mod = SourceModule(mod) func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) assert la.norm(c-(a+b)) == 0 - - - diff --git 
a/examples/from-wiki/demo_meta_matrixmul_cheetah.py b/examples/from-wiki/demo_meta_matrixmul_cheetah.py index 196edbbe3aca07f7d72f3d214feefc94bd60288f..ef80a627c3c1398b0bdeb213beef0428a767d8ec 100644 --- a/examples/from-wiki/demo_meta_matrixmul_cheetah.py +++ b/examples/from-wiki/demo_meta_matrixmul_cheetah.py @@ -1,20 +1,22 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -PyCuda Optimized Matrix Multiplication +""" +PyCuda Optimized Matrix Multiplication Template Meta-programming Example using Cheetah (modified from SciPy09 Advanced Tutorial) """ # ------------------------------------------------------------------------------ +from __future__ import annotations + import numpy as np -from pycuda import driver, compiler, gpuarray, tools from Cheetah.Template import Template -import pycuda.autoinit +from pycuda import compiler, gpuarray + # -- default parameters DEFAULT_BLOCK_SIZE = 16 @@ -24,29 +26,32 @@ DEFAULT_SPILL = False DEFAULT_PREFETCH = False from os import path + + MYPATH = path.dirname(path.abspath(__file__)) TEMPLATE_FILENAME = path.join(MYPATH, "demo_meta_matrixmul_cheetah.template.cu") + # ------------------------------------------------------------------------------ -def matrixmul_opt(mat_a, mat_b, - block_size = DEFAULT_BLOCK_SIZE, - work_size = DEFAULT_WORK_SIZE, - unroll = DEFAULT_UNROLL, - spill = DEFAULT_SPILL, - prefetch = DEFAULT_PREFETCH): - +def matrixmul_opt(mat_a, mat_b, + block_size=DEFAULT_BLOCK_SIZE, + work_size=DEFAULT_WORK_SIZE, + unroll=DEFAULT_UNROLL, + spill=DEFAULT_SPILL, + prefetch=DEFAULT_PREFETCH): + ah, aw = mat_a.shape bh, bw = mat_b.shape - + assert aw == bh # -- pad input matrices appropriately ah_padded = int(np.ceil(ah/block_size)) * block_size aw_padded = int(np.ceil(aw/block_size)) * (block_size*work_size) mat_a_padded = np.zeros((ah_padded, aw_padded), np.float32) - mat_a_padded[:ah,:aw] = mat_a + mat_a_padded[:ah, :aw] = mat_a - bh_padded = aw_padded + bh_padded = aw_padded bw_padded = int(np.ceil(bw/(block_size*work_size))) * (block_size*work_size) mat_b_padded = np.zeros((bh_padded, bw_padded), np.float32) mat_b_padded[:bh, :bw] = mat_b @@ -55,7 +60,7 @@ def matrixmul_opt(mat_a, mat_b, cw_padded = bw_padded # -- upload padded input matrices to the GPU - mat_a_gpu = gpuarray.to_gpu(mat_a_padded) + mat_a_gpu = gpuarray.to_gpu(mat_a_padded) mat_b_gpu = gpuarray.to_gpu(mat_b_padded) # -- create empty container matrix for the result (C = A * B) @@ -63,26 +68,26 @@ def matrixmul_opt(mat_a, mat_b, # -- generate and compile the code # prepare the template parameters - template_params = { - 'BLOCK_SIZE': block_size, - 'WORK_SIZE': work_size, - 'UNROLL': unroll, - 'SPILL': spill, - 'PREFETCH': prefetch, - 'A_WIDTH': aw_padded, - 'A_HEIGHT': ah_padded, - 'B_WIDTH': bw_padded, + template_params = { + "BLOCK_SIZE": block_size, + "WORK_SIZE": work_size, + "UNROLL": unroll, + "SPILL": spill, + "PREFETCH": prefetch, + "A_WIDTH": aw_padded, + "A_HEIGHT": ah_padded, + "B_WIDTH": bw_padded, } - + # run the template engine to get the code kernel_code = Template( - file = TEMPLATE_FILENAME, - searchList = [template_params], + file=TEMPLATE_FILENAME, + searchList=[template_params], ) - + # compile the code module = compiler.SourceModule(kernel_code) - + # get the kernel from the module matrixmul_func = module.get_function("matrixMul") @@ -90,30 +95,30 @@ def matrixmul_opt(mat_a, mat_b, print("number of registers used:", matrixmul_func.num_regs) # block of threads - # ATTENTION: block is (threadDim.x, threadDim.y, 
threadDim.z) + # ATTENTION: block is (threadDim.x, threadDim.y, threadDim.z) # and not (threadDim.z, threadDim.y, threadDim.x) - block = block_size, block_size, 1 - - # grid of blocks - # ATTENTION: it's (blockDim.x, blockDim.y) + block = block_size, block_size, 1 + + # grid of blocks + # ATTENTION: it's (blockDim.x, blockDim.y) # and not (blockDim.y, blockDim.x) grid = int(cw_padded/block_size/work_size), int(ch_padded/block_size) # -- call the kernel on the GPU - # Note that when we use time_kernel=True pycuda will automatically synchronize the kernel + # Note that when we use time_kernel=True pycuda will automatically synchronize the kernel # to make sure that the timing is correct. If you time the code yourself, you'll have to # synchronize the current Context. gpu_time = matrixmul_func( # -- output - mat_c_gpu, + mat_c_gpu, # -- inputs - mat_a_gpu, mat_b_gpu, + mat_a_gpu, mat_b_gpu, # -- grid of blocks - grid = grid, + grid=grid, # -- block of threads - block = block, + block=block, # -- time the kernel (approx.) - time_kernel = True, + time_kernel=True, ) # get the GPU matrix back to CPU memory @@ -122,20 +127,21 @@ def matrixmul_opt(mat_a, mat_b, return mat_c, gpu_time + # ------------------------------------------------------------------------------ -if __name__ == "__main__": +if __name__ == "__main__": # matrix sizes a_height = 1024 a_width = 1024 b_height = a_width b_width = 1024 - + # create random square matrices np.random.seed(0) mat_a = np.random.randn(a_height, a_width).astype(np.float32) mat_b = np.random.randn(b_height, b_width).astype(np.float32) - + # compute reference on the cpu to verify GPU computation mat_ref = np.dot(mat_a, mat_b) @@ -155,5 +161,3 @@ if __name__ == "__main__": gflop = mat_c.size * (a_width * 2.) / (1000**3.) gflops = gflop / gpu_time print("gflops:", gflops) - - diff --git a/examples/from-wiki/distance_element_wise3d.py b/examples/from-wiki/distance_element_wise3d.py index 240b32e9484d54848c39d6f929553b17e69c1872..64a15f74019050803e03567aae955bf3788e3b27 100644 --- a/examples/from-wiki/distance_element_wise3d.py +++ b/examples/from-wiki/distance_element_wise3d.py @@ -1,12 +1,15 @@ -#!python -import pycuda.gpuarray as gpuarray +#!python +from __future__ import annotations + +import random + +import numpy + import pycuda.driver as cuda -import pycuda.tools as tools -import pycuda.autoinit -import numpy, random, time -from pycuda.curandom import rand as curand +import pycuda.gpuarray as gpuarray from pycuda.elementwise import ElementwiseKernel as Elementwise + x = 50 y = 50 z = 2 @@ -14,6 +17,7 @@ width = 100 height = 100 depth = 100 + def main(): """ Computes a set of distances from a given point in a search space in parallel on a GPU. 
@@ -50,10 +54,9 @@ def main(): random.uniform(-width, width)), n, r) end.record() end.synchronize() - print((start.time_till(end))) + print(start.time_till(end)) print(r) -if __name__ == '__main__': - main() - +if __name__ == "__main__": + main() diff --git a/examples/from-wiki/game_of_life.py b/examples/from-wiki/game_of_life.py index 60564c3287c8fc723bc5756ad99ebd53ac384125..8663b216365bd902695b08ac2a8c29c95b483551 100644 --- a/examples/from-wiki/game_of_life.py +++ b/examples/from-wiki/game_of_life.py @@ -1,4 +1,4 @@ -#!python +#!python # Conway's Game of Life Accelerated with PyCUDA # Luis Villasenor # lvillasen@gmail.com @@ -6,27 +6,34 @@ # Licence: GPLv3 # Usage: python GameOfLife.py n n_iter # where n is the board size and n_iter the number of iterations -import pycuda.driver as cuda -import pycuda.tools -import pycuda.autoinit -import pycuda.gpuarray as gpuarray -from pycuda.compiler import SourceModule +from __future__ import annotations + import sys + +import matplotlib.pyplot as plt import numpy as np from pylab import cm as cm -import matplotlib.pyplot as plt -n=int(sys.argv[1]) -n_iter=int(sys.argv[2]) -n_block=16 -n_grid=int(n/n_block); -n=n_block*n_grid; + +import pycuda.gpuarray as gpuarray +from pycuda.compiler import SourceModule + + +n = int(sys.argv[1]) +n_iter = int(sys.argv[2]) +n_block = 16 +n_grid = int(n/n_block) +n = n_block*n_grid + + def random_init(n): - #np.random.seed(100) - M=np.zeros((n,n)).astype(np.int32) + # np.random.seed(100) + M = np.zeros((n, n)).astype(np.int32) for i in range(n): for j in range(n): - M[j,i]=np.int32(np.random.randint(2)) + M[j, i] = np.int32(np.random.randint(2)) return M + + mod = SourceModule(""" __global__ void step(int *C, int *M) { @@ -54,28 +61,27 @@ __global__ void step(int *C, int *M) } """) func = mod.get_function("step") -C=random_init(n) +C = random_init(n) M = np.empty_like(C) -C_gpu = gpuarray.to_gpu( C ) -M_gpu = gpuarray.to_gpu( M ) -for k in range(n_iter): - func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) +C_gpu = gpuarray.to_gpu(C) +M_gpu = gpuarray.to_gpu(M) +for _k in range(n_iter): + func(C_gpu, M_gpu, block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) C_gpu, M_gpu = M_gpu, C_gpu -print(("%d live cells after %d iterations" %(np.sum(C_gpu.get()),n_iter))) -fig = plt.figure(figsize=(12,12)) +print("%d live cells after %d iterations" % (np.sum(C_gpu.get()), n_iter)) +fig = plt.figure(figsize=(12, 12)) ax = fig.add_subplot(111) fig.suptitle("Conway's Game of Life Accelerated with PyCUDA") -ax.set_title('Number of Iterations = %d'%(n_iter)) -myobj =plt.imshow(C_gpu.get(),origin='lower',cmap='Greys', interpolation='nearest',vmin=0, vmax=1) +ax.set_title("Number of Iterations = %d" % (n_iter)) +myobj = plt.imshow(C_gpu.get(), origin="lower", cmap="Greys", interpolation="nearest", vmin=0, vmax=1) plt.pause(.01) plt.draw() -m=n_iter +m = n_iter while True: - m+=1 - func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) + m += 1 + func(C_gpu, M_gpu, block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) C_gpu, M_gpu = M_gpu, C_gpu myobj.set_data(C_gpu.get()) - ax.set_title('Number of Iterations = %d'%(m)) + ax.set_title("Number of Iterations = %d" % (m)) plt.pause(.01) plt.draw() - diff --git a/examples/from-wiki/gl_interop.py b/examples/from-wiki/gl_interop.py index 77249e51f7f6daca369f6d1b8d867feedc694f92..37268afec6e9fa8cd0fc7a018bc2d82a674928de 100644 --- a/examples/from-wiki/gl_interop.py +++ b/examples/from-wiki/gl_interop.py @@ -1,23 +1,27 @@ -#!python +#!python # GL 
interoperability example, by Peter Berrington. # Draws a rotating teapot, using cuda to invert the RGB value # each frame +from __future__ import annotations +import sys +import time + +import numpy from OpenGL.GL import * -from OpenGL.GLUT import * -from OpenGL.GLU import * -from OpenGL.GL.ARB.vertex_buffer_object import * from OpenGL.GL.ARB.pixel_buffer_object import * +from OpenGL.GL.ARB.vertex_buffer_object import * +from OpenGL.GLU import * +from OpenGL.GLUT import * - -import numpy, sys, time import pycuda.driver as cuda_driver import pycuda.gl as cuda_gl from pycuda.compiler import SourceModule -#this is all munged together from the CUDA SDK postprocessGL example. -initial_size = 512,512 +# this is all munged together from the CUDA SDK postprocessGL example. + +initial_size = 512, 512 current_size = initial_size animate = True enable_cuda = True @@ -26,15 +30,16 @@ time_of_last_draw = 0.0 time_of_last_titleupdate = 0.0 frames_per_second = 0.0 frame_counter = 0 -output_texture = None # pointer to offscreen render target +output_texture = None # pointer to offscreen render target (source_pbo, dest_pbo, cuda_module, invert, pycuda_source_pbo, pycuda_dest_pbo) = [None]*6 -heading,pitch,bank = [0.0]*3 +heading, pitch, bank = [0.0]*3 + -def create_PBOs(w,h): +def create_PBOs(w, h): global source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo num_texels = w*h - data = numpy.zeros((num_texels,4),numpy.uint8) + data = numpy.zeros((num_texels, 4), numpy.uint8) source_pbo = glGenBuffers(1) glBindBuffer(GL_ARRAY_BUFFER, source_pbo) glBufferData(GL_ARRAY_BUFFER, data, GL_DYNAMIC_DRAW) @@ -46,15 +51,17 @@ def create_PBOs(w,h): glBindBuffer(GL_ARRAY_BUFFER, 0) pycuda_dest_pbo = cuda_gl.BufferObject(int(dest_pbo)) + def destroy_PBOs(): global source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo for pbo in [source_pbo, dest_pbo]: glBindBuffer(GL_ARRAY_BUFFER, int(pbo)) glDeleteBuffers(1, int(pbo)) glBindBuffer(GL_ARRAY_BUFFER, 0) - source_pbo,dest_pbo,pycuda_source_pbo,pycuda_dest_pbo = [None]*4 + source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo = [None]*4 + -def create_texture(w,h): +def create_texture(w, h): global output_texture output_texture = glGenTextures(1) glBindTexture(GL_TEXTURE_2D, output_texture) @@ -67,11 +74,13 @@ def create_texture(w,h): glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, None) + def destroy_texture(): global output_texture glDeleteTextures(output_texture) output_texture = None + def init_gl(): Width, Height = current_size glClearColor(0.1, 0.1, 0.5, 1.0) @@ -82,11 +91,12 @@ def init_gl(): gluPerspective(60.0, Width/float(Height), 0.1, 10.0) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL) glEnable(GL_LIGHT0) - red = ( 1.0, 0.1, 0.1, 1.0 ) - white = ( 1.0, 1.0, 1.0, 1.0 ) - glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red ) + red = (1.0, 0.1, 0.1, 1.0) + white = (1.0, 1.0, 1.0, 1.0) + glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red) glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white) - glMaterialf( GL_FRONT_AND_BACK, GL_SHININESS, 60.0) + glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0) + def resize(Width, Height): global current_size @@ -96,32 +106,35 @@ def resize(Width, Height): glLoadIdentity() gluPerspective(60.0, Width/float(Height), 0.1, 10.0) + def do_tick(): global time_of_last_titleupdate, frame_counter, frames_per_second - if ((time.clock () * 1000.0) - time_of_last_titleupdate >= 1000.): + if ((time.clock() * 1000.0) - time_of_last_titleupdate >= 1000.): frames_per_second = frame_counter # Save The FPS frame_counter = 0 # 
Reset The FPS Counter - szTitle = "%d FPS" % (frames_per_second ) - glutSetWindowTitle ( szTitle ) - time_of_last_titleupdate = time.clock () * 1000.0 + szTitle = "%d FPS" % (frames_per_second) + glutSetWindowTitle(szTitle) + time_of_last_titleupdate = time.clock() * 1000.0 frame_counter += 1 + # The function called whenever a key is pressed. Note the use of Python tuples to pass in: (key, x, y) def keyPressed(*args): global animate, enable_cuda # If escape is pressed, kill everything. - if args[0] == '\033': - print('Closing..') + if args[0] == "\033": + print("Closing..") destroy_PBOs() destroy_texture() exit() - elif args[0] == 'a': - print('toggling animation') + elif args[0] == "a": + print("toggling animation") animate = not animate - elif args[0] == 'e': - print('toggling cuda') + elif args[0] == "e": + print("toggling cuda") enable_cuda = not enable_cuda + def idle(): global heading, pitch, bank if animate: @@ -131,6 +144,7 @@ def idle(): glutPostRedisplay() + def display(): try: render_scene() @@ -144,9 +158,10 @@ def display(): from os import _exit _exit(0) + def process(width, height): """ Use PyCuda """ - grid_dimensions = (width//16,height//16) + grid_dimensions = (width//16, height//16) source_mapping = pycuda_source_pbo.map() dest_mapping = pycuda_dest_pbo.map() @@ -160,9 +175,10 @@ def process(width, height): source_mapping.unmap() dest_mapping.unmap() + def process_image(): """ copy image and process using CUDA """ - global pycuda_source_pbo,source_pbo,current_size, dest_pbo + global pycuda_source_pbo, source_pbo, current_size, dest_pbo image_width, image_height = current_size assert source_pbo is not None @@ -174,12 +190,12 @@ def process_image(): # read data into pbo. note: use BGRA format for optimal performance glReadPixels( - 0, #start x - 0, #start y - image_width, #end x - image_height, #end y - GL_BGRA, #format - GL_UNSIGNED_BYTE, #output type + 0, # start x + 0, # start y + image_width, # end x + image_height, # end y + GL_BGRA, # format + GL_UNSIGNED_BYTE, # output type ctypes.c_void_p(0)) pycuda_source_pbo = cuda_gl.BufferObject(int(source_pbo)) @@ -195,6 +211,7 @@ def process_image(): image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, ctypes.c_void_p(0)) + def display_image(): """ render a screen sized quad """ glDisable(GL_DEPTH_TEST) @@ -205,7 +222,7 @@ def display_image(): glPushMatrix() glLoadIdentity() glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0) - glMatrixMode( GL_MODELVIEW) + glMatrixMode(GL_MODELVIEW) glLoadIdentity() glViewport(0, 0, current_size[0], current_size[1]) glBegin(GL_QUADS) @@ -226,21 +243,22 @@ def display_image(): def render_scene(): - glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)# Clear Screen And Depth Buffer + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # Clear Screen And Depth Buffer glMatrixMode(GL_MODELVIEW) - glLoadIdentity () # Reset The Modelview Matrix + glLoadIdentity() # Reset The Modelview Matrix glTranslatef(0.0, 0.0, -3.0) glRotatef(heading, 1.0, 0.0, 0.0) - glRotatef(pitch , 0.0, 1.0, 0.0) - glRotatef(bank , 0.0, 0.0, 1.0) - glViewport(0, 0, current_size[0],current_size[1]) + glRotatef(pitch, 0.0, 1.0, 0.0) + glRotatef(bank, 0.0, 0.0, 1.0) + glViewport(0, 0, current_size[0], current_size[1]) glEnable(GL_LIGHTING) glEnable(GL_DEPTH_TEST) glDepthFunc(GL_LESS) glutSolidTeapot(1.0) - do_tick()#just for fps display.. + do_tick() # just for fps display.. 
return True + def main(): global window, cuda_module, cuda_gl, cuda_driver, invert glutInit(sys.argv) @@ -258,9 +276,9 @@ def main(): # create texture for blitting to screen create_texture(*initial_size) - #setup pycuda gl interop - import pycuda.gl.autoinit + # setup pycuda gl interop import pycuda.gl + import pycuda.gl.autoinit cuda_gl = pycuda.gl cuda_driver = pycuda.driver @@ -287,8 +305,8 @@ def main(): glutMainLoop() + # Print message to console, and kick off the main to get it rolling. if __name__ == "__main__": print("Hit ESC key to quit, 'a' to toggle animation, and 'e' to toggle cuda") main() - diff --git a/examples/from-wiki/gpu_scalar_mult.py b/examples/from-wiki/gpu_scalar_mult.py index 5d343db159cfe7f82dea518bdb1ba4044de62452..11e133cdde98d6f6c1efe418d4ec19935fc21308 100644 --- a/examples/from-wiki/gpu_scalar_mult.py +++ b/examples/from-wiki/gpu_scalar_mult.py @@ -1,13 +1,13 @@ -#!python +#!python +from __future__ import annotations + import numpy -import pycuda.autoinit + import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.tools import context_dependent_memoize - - def main(dtype): from pycuda.elementwise import get_linear_combination_kernel lc_kernel, lc_texrefs = get_linear_combination_kernel(( @@ -30,7 +30,7 @@ def main(dtype): stop = drv.Event() start.record() - for i in range(20): + for _i in range(20): a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True) b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True) lc_kernel.prepared_call(x._grid, x._block, @@ -42,13 +42,11 @@ def main(dtype): print(size, size_exp, stop.time_since(start)) - @context_dependent_memoize def get_lin_comb_kernel_no_tex(summand_descriptors, dtype_z): + from pycuda.elementwise import ScalarArg, VectorArg, get_elwise_module from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import \ - VectorArg, ScalarArg, get_elwise_module args = [] loop_prep = [] @@ -81,14 +79,13 @@ def get_lin_comb_kernel_no_tex(summand_descriptors, return func - def main_no_tex(dtype): lc_kernel = get_lin_comb_kernel_no_tex(( (True, dtype, dtype), (True, dtype, dtype) ), dtype) - for size_exp in range(10,26): + for size_exp in range(10, 26): size = 1 << size_exp from pycuda.curandom import rand @@ -103,7 +100,7 @@ def main_no_tex(dtype): stop = drv.Event() start.record() - for i in range(20): + for _i in range(20): lc_kernel.prepared_call(x._grid, x._block, a.gpudata, x.gpudata, b.gpudata, y.gpudata, @@ -115,12 +112,9 @@ def main_no_tex(dtype): print(size, size_exp, stop.time_since(start)) - - if __name__ == "__main__": dtype = numpy.float32 main(dtype) print() main_no_tex(dtype) - diff --git a/examples/from-wiki/kernel_concurrency.py b/examples/from-wiki/kernel_concurrency.py index 8b04a19fe44e000b2683932f7bda6e5c34aad21a..958675fd70f760b633636aa207eb1faa172c95ec 100644 --- a/examples/from-wiki/kernel_concurrency.py +++ b/examples/from-wiki/kernel_concurrency.py @@ -1,17 +1,20 @@ -#!python -#! /usr/bin/env python +#!python +# ! /usr/bin/env python # A simple program to illustrate kernel concurrency with PyCuda. # Reference: Chapter 3.2.6.5 in Cuda C Programming Guide Version 3.2. # Jesse Lu, 2011-04-04 +from __future__ import annotations import numpy as np + import pycuda.autoinit import pycuda.driver as drv from pycuda.compiler import SourceModule + # # Set up test scenario. -# +# # Create a simple test kernel. mod = SourceModule(""" @@ -29,13 +32,13 @@ __global__ void my_kernel(float *d) { my_kernel = mod.get_function("my_kernel") # Create the test data on the host. 
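
# Note that gpu_scalar_mult.py above now drops `import pycuda.autoinit`, but
# that import exists purely for its side effect: it initializes the driver and
# creates a context, which the Events and gpuarrays in main() require. A
# minimal sketch of doing the same explicitly (single-GPU case assumed):

import pycuda.driver as drv

drv.init()                           # load/initialize the CUDA driver
ctx = drv.Device(0).make_context()   # create and push a context on device 0
try:
    pass  # ... main(numpy.float32) etc. would run here ...
finally:
    ctx.pop()                        # release the context on the way out
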
-N = 400 # Size of datasets. -n = 2 # Number of datasets (and concurrent operations) used. +N = 400 # Size of datasets. +n = 2 # Number of datasets (and concurrent operations) used. data, data_check, d_data = [], [], [] for k in range(n): - data.append(np.random.randn(N).astype(np.float32)) # Create random data. - data_check.append(data[k].copy()) # For checking the result afterwards. - d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device. + data.append(np.random.randn(N).astype(np.float32)) # Create random data. + data_check.append(data[k].copy()) # For checking the result afterwards. + d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device. # # Start concurrency test. @@ -47,49 +50,48 @@ ref.record() # Create the streams and events needed. stream, event = [], [] -marker_names = ['kernel_begin', 'kernel_end'] +marker_names = ["kernel_begin", "kernel_end"] for k in range(n): stream.append(drv.Stream()) event.append({marker_names[l]: drv.Event() for l in range(len(marker_names))}) # Transfer to device. for k in range(n): - drv.memcpy_htod(d_data[k], data[k]) + drv.memcpy_htod(d_data[k], data[k]) # Run kernels many times, we will only keep data from last loop iteration. for j in range(10): for k in range(n): - event[k]['kernel_begin'].record(stream[k]) - my_kernel(d_data[k], block=(N,1,1), stream=stream[k]) - for k in range(n): # Commenting out this line should break concurrency. - event[k]['kernel_end'].record(stream[k]) + event[k]["kernel_begin"].record(stream[k]) + my_kernel(d_data[k], block=(N, 1, 1), stream=stream[k]) + for k in range(n): # Commenting out this line should break concurrency. + event[k]["kernel_end"].record(stream[k]) # Transfer data back to host. for k in range(n): - drv.memcpy_dtoh(data[k], d_data[k]) + drv.memcpy_dtoh(data[k], d_data[k]) -# +# # Output results. # -print('\n=== Device attributes') +print("\n=== Device attributes") dev = pycuda.autoinit.device -print(('Name:', dev.name())) -print(('Compute capability:', dev.compute_capability())) -print(('Concurrent Kernels:', \ +print(("Name:", dev.name())) +print(("Compute capability:", dev.compute_capability())) +print(("Concurrent Kernels:", bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS)))) -print('\n=== Checking answers') +print("\n=== Checking answers") for k in range(n): - print(('Dataset', k, ':',)) + print(("Dataset", k, ":",)) if (np.linalg.norm((data_check[k] * 2**(j+1)) - data[k]) == 0.0): - print('passed.') + print("passed.") else: - print('FAILED!') + print("FAILED!") -print('\n=== Timing info (for last set of kernel launches)') +print("\n=== Timing info (for last set of kernel launches)") for k in range(n): - print(('Dataset', k)) + print(("Dataset", k)) for l in range(len(marker_names)): - print((marker_names[l], ':', ref.time_till(event[k][marker_names[l]]))) - + print((marker_names[l], ":", ref.time_till(event[k][marker_names[l]]))) diff --git a/examples/from-wiki/light_field_3d_viewer.py b/examples/from-wiki/light_field_3d_viewer.py index 3f22e005a43c983631444babc97e8fbb35b0b0a3..44e235504ab3c14467863e68188ac1d5e70b6145 100644 --- a/examples/from-wiki/light_field_3d_viewer.py +++ b/examples/from-wiki/light_field_3d_viewer.py @@ -1,4 +1,4 @@ -#!python +#!python """ 3D display of Light Field images. Example images can be download from: @@ -15,25 +15,25 @@ Prerequisites: Author: Amit Aides. amitibo at technion . ac . 
il """ +from __future__ import annotations - -from enthought.traits.api import HasTraits, Range, on_trait_change -from enthought.traits.ui.api import View, Item -from enthought.chaco.api import Plot, ArrayPlotData, gray -from enthought.enable.component_editor import ComponentEditor - -import numpy as np -import Image import argparse -import os.path +import contextlib import math +import os.path -import pycuda.driver as cuda -import pycuda.compiler -import pycuda.autoinit - +import Image +import numpy as np +from enthought.chaco.api import ArrayPlotData, Plot, gray +from enthought.enable.component_editor import ComponentEditor +from enthought.traits.api import HasTraits, Range, on_trait_change +from enthought.traits.ui.api import Item, View from jinja2 import Template +import pycuda.autoinit +import pycuda.compiler +import pycuda.driver as cuda + _kernel_tpl = Template(""" {% if NCHANNELS == 3 %} @@ -64,7 +64,7 @@ __global__ void LFview_kernel( // calculate offset into destination array // unsigned int didx = (y * {{newiw}} + x) * {{NCHANNELS}}; - + // // calculate offset into source array (be aware of rotation and scaling) // @@ -99,10 +99,10 @@ def ceil(x): class LFapplication(HasTraits): traits_view = View( - Item('LF_img', editor=ComponentEditor(), show_label=False), - Item('X_angle', label='Angle in the X axis'), - Item('Y_angle', label='Angle in the Y axis'), - resizable = True, + Item("LF_img", editor=ComponentEditor(), show_label=False), + Item("X_angle", label="Angle in the X axis"), + Item("Y_angle", label="Angle in the Y axis"), + resizable=True, title="LF Image" ) @@ -113,21 +113,19 @@ class LFapplication(HasTraits): # Load image data # base_path = os.path.splitext(img_path)[0] - lenslet_path = base_path + '-lenslet.txt' - optics_path = base_path + '-optics.txt' + lenslet_path = base_path + "-lenslet.txt" + optics_path = base_path + "-optics.txt" with open(lenslet_path) as f: tmp = eval(f.readline()) - x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ + x_offset, y_offset, right_dx, _right_dy, _down_dx, down_dy = \ np.array(tmp, dtype=np.float32) with open(optics_path) as f: for line in f: name, val = line.strip().split() - try: + with contextlib.suppress(Exception): setattr(self, name, np.float32(val)) - except: - pass max_angle = math.atan(self.pitch/2/self.flen) @@ -135,7 +133,7 @@ class LFapplication(HasTraits): # Prepare image # im_pil = Image.open(img_path) - if im_pil.mode == 'RGB': + if im_pil.mode == "RGB": self.NCHANNELS = 3 w, h = im_pil.size im = np.zeros((h, w, 4), dtype=np.float32) @@ -170,10 +168,10 @@ class LFapplication(HasTraits): NCHANNELS=self.NCHANNELS ) ) - + self.LFview_func = mod_LFview.get_function("LFview_kernel") self.texref = mod_LFview.get_texref("tex") - + # # Now generate the cuda texture # @@ -184,7 +182,7 @@ class LFapplication(HasTraits): ) else: cuda.matrix_to_texref(im, self.texref, order="C") - + # # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) 
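
# The jinja2-templated kernel above bakes the image dimensions in as
# compile-time constants. The same technique in miniature (toy kernel, all
# names illustrative; assumes a context, e.g. via pycuda.autoinit):

from jinja2 import Template

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

_tpl = Template("""
__global__ void scale(float *x)
{
    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < {{ n }})
        x[i] *= {{ factor }}f;
}
""")

scale = SourceModule(_tpl.render(n=1024, factor=2.0)).get_function("scale")
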
@@ -195,9 +193,9 @@ class LFapplication(HasTraits): # # Prepare the traits # - self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0)) - self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0)) - + self.add_trait("X_angle", Range(-max_angle, max_angle, 0.0)) + self.add_trait("Y_angle", Range(-max_angle, max_angle, 0.0)) + self.plotdata = ArrayPlotData(LF_img=self.sampleLF()) self.LF_img = Plot(self.plotdata) if self.NCHANNELS == 3: @@ -210,9 +208,9 @@ class LFapplication(HasTraits): # Get the output image # output = np.zeros(self.LF_dim, dtype=np.uint8) - + # - # Calculate the gridsize. This is entirely given by the size of our image. + # Calculate the gridsize. This is entirely given by the size of our image. # blocks = (16, 16, 1) gridx = ceil(self.LF_dim[1]/blocks[1]) @@ -233,22 +231,21 @@ class LFapplication(HasTraits): return output - @on_trait_change('X_angle, Y_angle') + @on_trait_change("X_angle, Y_angle") def updateImge(self): - self.plotdata.set_data('LF_img', self.sampleLF()) - - + self.plotdata.set_data("LF_img", self.sampleLF()) + + def main(img_path): """Main function""" app = LFapplication(img_path) app.configure_traits() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='View an LF image') - parser.add_argument('img_path', type=str, help='Path to LF image') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="View an LF image") + parser.add_argument("img_path", type=str, help="Path to LF image") args = parser.parse_args() main(args.img_path) - diff --git a/examples/from-wiki/mandelbrot.py b/examples/from-wiki/mandelbrot.py index 92958d2bdfe894d2da3352b747b5e1a902643602..8c214cc78353fa5621a9ea384aa08d4e2d102c51 100644 --- a/examples/from-wiki/mandelbrot.py +++ b/examples/from-wiki/mandelbrot.py @@ -1,4 +1,4 @@ -#!python +#!python # Mandelbrot calculate using GPU, Serial numpy and faster numpy # Use to show the speed difference between CPU and GPU calculations # ian@ianozsvald.com July 2010 @@ -6,31 +6,33 @@ # Based on vegaseat's TKinter/numpy example code from 2006 # http://www.daniweb.com/code/snippet216851.html# # with minor changes to move to numpy from the obsolete Numeric +from __future__ import annotations import sys -import numpy as nm - import tkinter as tk -import Image # PIL -import ImageTk # PIL + +import Image # PIL +import ImageTk # PIL +import numpy as nm import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray + # set width and height of window, more pixels take longer to calculate w = 1000 h = 1000 from pycuda.elementwise import ElementwiseKernel + + complex_gpu = ElementwiseKernel( "pycuda::complex *z, pycuda::complex *q, int *iteration, int maxiter", "for (int n=0; n < maxiter; n++) {z[i] = (z[i]*z[i])+q[i]; if (abs(z[i]) > 2.0f) {iteration[i]=n; z[i] = pycuda::complex(); q[i] = pycuda::complex();};}", "complex5", preamble="#include ",) + def calculate_z_gpu(q, maxiter, z): output = nm.resize(nm.array(0,), q.shape) q_gpu = gpuarray.to_gpu(q.astype(nm.complex64)) @@ -58,7 +60,7 @@ def calculate_z_numpy_gpu(q, maxiter, z): # we'll add 1 to iterg after each iteration iterg = gpuarray.to_gpu(nm.array([0]*zg.size).astype(nm.int32)) - for iter in range(maxiter): + for _iter in range(maxiter): zg = zg*zg + qg # abs returns a complex (rather than a float) from the complex @@ -84,11 +86,12 @@ def calculate_z_numpy(q, maxiter, z): for iter in range(maxiter): z = z*z + q done = nm.greater(abs(z), 2.0) - q = 
nm.where(done,0+0j, q) - z = nm.where(done,0+0j, z) + q = nm.where(done, 0+0j, q) + z = nm.where(done, 0+0j, z) output = nm.where(done, iter, output) return output + def calculate_z_serial(q, maxiter, z): # calculate z using pure python with numpy arrays # this routine unrolls calculate_z_numpy as an intermediate @@ -112,9 +115,8 @@ show_instructions = False if len(sys.argv) == 1: show_instructions = True -if len(sys.argv) > 1: - if sys.argv[1] not in ['gpu', 'gpuarray', 'numpy', 'python']: - show_instructions = True +if len(sys.argv) > 1 and sys.argv[1] not in ["gpu", "gpuarray", "numpy", "python"]: + show_instructions = True if show_instructions: print("Usage: python mandelbrot.py [gpu|gpuarray|numpy|python]") @@ -125,13 +127,13 @@ if show_instructions: print(" python is a pure Python solution on the CPU with numpy arrays") sys.exit(0) -routine = {'gpuarray':calculate_z_numpy_gpu, - 'gpu':calculate_z_gpu, - 'numpy':calculate_z_numpy, - 'python':calculate_z_serial} +routine = {"gpuarray": calculate_z_numpy_gpu, + "gpu": calculate_z_gpu, + "numpy": calculate_z_numpy, + "python": calculate_z_serial} calculate_z = routine[sys.argv[1]] -##if sys.argv[1] == 'python': +# if sys.argv[1] == 'python': # import psyco # psyco.full() @@ -149,7 +151,8 @@ calculate_z = routine[sys.argv[1]] # numpy: 43.4s # python (serial): 1605.6s -class Mandelbrot(object): + +class Mandelbrot: def __init__(self): # create window self.root = tk.Tk() @@ -159,7 +162,6 @@ class Mandelbrot(object): # start event loop self.root.mainloop() - def draw(self, x1, x2, y1, y2, maxiter=300): # draw the Mandelbrot set, from numpy example xx = nm.arange(x1, x2, (x2-x1)/w*2) @@ -199,7 +201,7 @@ class Mandelbrot(object): self.label = tk.Label(self.root, image=self.image) self.label.pack() + # test the class -if __name__ == '__main__': +if __name__ == "__main__": test = Mandelbrot() - diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 66c3a011d39e0a4ddaf991e9e5591a808613b1b9..4b81767c2bc7b882844555e4b0a45e51909340c7 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -1,4 +1,4 @@ -#!python +#!python # Interactive Mandelbrot Set Accelerated using PyCUDA # Classical Iteration Method # Luis Villasenor @@ -7,11 +7,11 @@ # Licence: GPLv3 # Usage -# Use the left buttom to draw a square to zoom into +# Use the left buttom to draw a square to zoom into # Point and click with the right buttom to magnify by a factor of 10 -# Click with the left button on the right side of the +# Click with the left button on the right side of the # image to randomly change the colormap # Click with right button on the right side of the image to set the default colormap @@ -27,30 +27,31 @@ # Type 'f' to toggle full-screen mode # Type 's' to save the image +from __future__ import annotations - -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -import numpy as np -from pylab import cm as cm import matplotlib.pyplot as plt -from matplotlib.widgets import RectangleSelector +import numpy as np from matplotlib.patches import Rectangle +from matplotlib.widgets import RectangleSelector +from pylab import cm as cm + +import pycuda.driver as drv from pycuda.compiler import SourceModule -global N,n_block,n_grid,x0,y0,side,L,M,power -L=400; -N=800;n_block=16;n_grid=int(N/16); -N=n_block*n_grid; -x0=-.5;y0=0. 
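
# The complex5 ElementwiseKernel in mandelbrot.py above folds the whole
# escape-time loop into one fused GPU kernel. For orientation, here is the
# ElementwiseKernel pattern at its smallest (a toy a*x+b kernel, not part of
# the Mandelbrot code; assumes a context via pycuda.autoinit):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

axpb = ElementwiseKernel(
    "float a, float *x, float b",   # argument declaration
    "x[i] = a*x[i] + b",            # per-element operation
    "axpb")

x = gpuarray.to_gpu(numpy.ones(16, dtype=numpy.float32))
axpb(numpy.float32(3), x, numpy.float32(1))
assert (x.get() == 4).all()
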
-side=3.0 -i_cmap=49 -power=2 -fig = plt.figure(figsize=(12,12)) -fig.suptitle('Interactive Mandelbrot Set, Accelerated with PyCUDA') + + +global N, n_block, n_grid, x0, y0, side, L, M, power +L = 400 +N = 800; n_block = 16; n_grid = int(N/16) +N = n_block*n_grid +x0 = -.5; y0 = 0. +side = 3.0 +i_cmap = 49 +power = 2 +fig = plt.figure(figsize=(12, 12)) +fig.suptitle("Interactive Mandelbrot Set, Accelerated with PyCUDA") ax = fig.add_subplot(111) -cmaps=[m for m in cm.datad if not m.endswith("_r")] -N,x0,y0,side,L,power +cmaps = [m for m in cm.datad if not m.endswith("_r")] +N, x0, y0, side, L, power mod = SourceModule(""" #include @@ -78,123 +79,130 @@ __global__ void mandelbrot(double x0, double y0,double side, int L,int power,int M[threadId]=h; } """) -M = np.zeros((N,N)).astype(np.int32) +M = np.zeros((N, N)).astype(np.int32) func = mod.get_function("mandelbrot") -func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) +func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + + def zoom_on_square(eclick, erelease): - 'eclick and erelease are the press and release events' - global N,side,x0,y0,myobj,M,power - x1, y1 = min(eclick.xdata,erelease.xdata),min( eclick.ydata,erelease.ydata) - x2, y2 = max(eclick.xdata,erelease.xdata),max( eclick.ydata,erelease.ydata) - #print(" The button you used were: %s %s" % (eclick.button, erelease.button)) - #print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x1, y1, x0,y0) - #print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x2, y2, x0,y0) - x_1=x0+side*(x1-N/2.)/N - y_1=y0+side*(y1-N/2.)/N - x_2=x0+side*(x2-N/2.)/N - y_2=y0+side*(y2-N/2.)/N - x0=(x_2+x_1)/2. - y0=(y_2+y_1)/2. - side=side*(x2-x1+y2-y1)/N/2 # Average of the 2 rectangle sides - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) + "eclick and erelease are the press and release events" + global N, side, x0, y0, myobj, M, power + x1, y1 = min(eclick.xdata, erelease.xdata), min(eclick.ydata, erelease.ydata) + x2, y2 = max(eclick.xdata, erelease.xdata), max(eclick.ydata, erelease.ydata) + # print(" The button you used were: %s %s" % (eclick.button, erelease.button)) + # print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x1, y1, x0,y0) + # print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x2, y2, x0,y0) + x_1 = x0+side*(x1-N/2.)/N + y_1 = y0+side*(y1-N/2.)/N + x_2 = x0+side*(x2-N/2.)/N + y_2 = y0+side*(y2-N/2.)/N + x0 = (x_2+x_1)/2. + y0 = (y_2+y_1)/2. 
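
# The zoom arithmetic above recenters the view: pixel p of an N-pixel axis
# maps to the complex plane as center + side*(p - N/2)/N. The same transform
# as a pure-Python helper (pixel_to_plane is a hypothetical name):

def pixel_to_plane(px, py, x0, y0, side, n):
    """Map pixel (px, py) of an n x n view onto the complex plane."""
    return (x0 + side*(px - n/2.0)/n,
            y0 + side*(py - n/2.0)/n)

# The view center maps to (x0, y0) itself:
assert pixel_to_plane(400, 400, -0.5, 0.0, 3.0, 800) == (-0.5, 0.0)
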
+ side = side*(x2-x1+y2-y1)/N/2 # Average of the 2 rectangle sides + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) myobj.set_data(M) - ax.add_patch(Rectangle((1 - .1, 1 - .1), 0.2, 0.2,alpha=1, facecolor='none',fill=None, )) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + ax.add_patch(Rectangle((1 - .1, 1 - .1), 0.2, 0.2, alpha=1, facecolor="none", fill=None, )) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() + def key_selector(event): - global N,side,x0,y0,myobj,M,power,L,i_cmap,n_grid - #print(' Key pressed.') - if event.key == 'up': # Increase max number of iterations - L=int(L*1.2); - print(("Maximum number of iterations changed to %d" % L)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + global N, side, x0, y0, myobj, M, power, L, i_cmap, n_grid + # print(' Key pressed.') + if event.key == "up": # Increase max number of iterations + L = int(L*1.2) + print("Maximum number of iterations changed to %d" % L) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'down': # Decrease max number of iterations - L=int(L/1.2); - print(("Maximum number of iterations changed to %d" % L)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "down": # Decrease max number of iterations + L = int(L/1.2) + print("Maximum number of iterations changed to %d" % L) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'right': # Increase number of pixels - N=int(N*1.2); - n_grid=int(N/16.); - N=n_block*n_grid; - M = np.zeros((N,N)).astype(np.int32) - print(("Number of pixels per dimension changed to %d" % N)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "right": # Increase number of pixels + N = int(N*1.2) + n_grid = int(N/16.) 
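
# The resize branches here snap N to a multiple of the 16-wide CUDA block
# (N = n_block * int(N/16)) so the launch grid tiles the image exactly. The
# same rounding as a named helper (illustrative, not from the example):

def snap_to_block(n, block=16):
    """Round n down to a multiple of block, but never below one block."""
    return max(block, (n // block) * block)

assert snap_to_block(970) == 960   # 970 pixels -> 60 blocks of 16
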
+ N = n_block*n_grid + M = np.zeros((N, N)).astype(np.int32) + print("Number of pixels per dimension changed to %d" % N) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'left': # Decrease number of pixels - N=int(N/1.2); - n_grid=int(N/16.); - N=n_block*n_grid; - M = np.zeros((N,N)).astype(np.int32) - print(("Number of pixels per dimension changed to %d" % N)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "left": # Decrease number of pixels + N = int(N/1.2) + n_grid = int(N/16.) + N = n_block*n_grid + M = np.zeros((N, N)).astype(np.int32) + print("Number of pixels per dimension changed to %d" % N) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key in ['1','2','3','4','5','6','7','8','9'] : # Decrease number of pixels - power=int(event.key) - if power <10 and power >0 : - print(("Power index set to %d" % power)) - i_cmap=49 - side=3.0; x0=-.5;y0=0.;L=200; - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]: # Decrease number of pixels + power = int(event.key) + if power < 10 and power > 0: + print("Power index set to %d" % power) + i_cmap = 49 + side = 3.0; x0 = -.5; y0 = 0.; L = 200 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() + key_selector.RS = RectangleSelector(ax, zoom_on_square, - drawtype='box', useblit=True, + drawtype="box", useblit=True, button=[1, 3], # don't use middle button minspanx=5, minspany=5, - spancoords='pixels') + spancoords="pixels") + + # interactive=False) def zoom_on_point(event): - global N,side,x0,y0,myobj,L,M,i_cmap,power - #print(" Button pressed: %d" % (event.button)) - #print(' event.x= %f, event.y= %f '%(event.x,event.y)) - if event.button==3 and event.inaxes: # Zoom on clicked point; new side=10% of old side + global N, side, x0, y0, myobj, L, M, i_cmap, power + # print(" Button pressed: %d" % (event.button)) + # print(' event.x= %f, event.y= %f '%(event.x,event.y)) + if event.button == 3 and event.inaxes: # Zoom on clicked point; new side=10% of old side x1, y1 = event.xdata, event.ydata - x0=x0+side*(x1-N/2.)/N - y0=y0+side*(y1-N/2.)/N - side=side*.1 - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - 
myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + x0 = x0+side*(x1-N/2.)/N + y0 = y0+side*(y1-N/2.)/N + side = side*.1 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if not event.inaxes and event.x<.3*N : # Click on left side of image to reset to full fractal - power=2; side=3.0; x0=-.5;y0=0.;i_cmap=49 - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if not event.inaxes and event.x < .3*N: # Click on left side of image to reset to full fractal + power = 2; side = 3.0; x0 = -.5; y0 = 0.; i_cmap = 49 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) + plt.draw() + if event.button == 1 and not event.inaxes and event.x > .7*N: # Left click on right side of image to set a random colormap + i_cmap_current = i_cmap + i_cmap = np.random.randint(len(cmaps)) + if i_cmap == i_cmap_current: + i_cmap -= 1 + if i_cmap < 0: i_cmap = len(cmaps)-1 + # print("color=",i_cmap) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) + plt.draw() + if event.button == 3 and not event.inaxes and event.x > .7*N: # Right click on right side to set default mapolormap + i_cmap = 49 + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.button==1 and not event.inaxes and event.x>.7*N : # Left click on right side of image to set a random colormap - i_cmap_current=i_cmap - i_cmap=np.random.randint(len(cmaps)) - if i_cmap==i_cmap_current: - i_cmap-=1 - if i_cmap< 0 : i_cmap=len(cmaps)-1 - #print("color=",i_cmap) - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) - plt.draw() - if event.button==3 and not event.inaxes and event.x>.7*N : # Right click on right side to set default mapolormap - i_cmap=49 - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) - plt.draw() -fig.canvas.mpl_connect('button_press_event', zoom_on_point) -fig.canvas.mpl_connect('key_press_event', key_selector) -func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) -ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) -plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) -plt.show() + +fig.canvas.mpl_connect("button_press_event", zoom_on_point) +fig.canvas.mpl_connect("key_press_event", key_selector) +func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), 
grid=(n_grid, n_grid, 1)) +ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) +plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) +plt.show() diff --git a/examples/from-wiki/manhattan_distance_for_2D_array.py b/examples/from-wiki/manhattan_distance_for_2D_array.py index f92ab1e16d80c8d22ddf80e73ea919284f5f8435..6a71d7d956e4357064d14a53f6f847dc8cc480cb 100644 --- a/examples/from-wiki/manhattan_distance_for_2D_array.py +++ b/examples/from-wiki/manhattan_distance_for_2D_array.py @@ -1,11 +1,12 @@ -#!python +#!python +from __future__ import annotations import numpy -import pycuda.autoinit -import pycuda.driver as cuda +import pycuda.driver as cuda from pycuda.compiler import SourceModule + w = 7 mod = SourceModule(""" @@ -35,24 +36,24 @@ def diffusion(res): y = numpy.float32(1) z = numpy.float32(1) - height, width = numpy.int32(len(res)), numpy.int32(len(res[0])) diff_func( cuda.InOut(res), width, - height,x,y,z, - block=(w,w,1) + height, x, y, z, + block=(w, w, 1) ) + def run(res, step): diffusion(res) print(res) -res = numpy.array([[0 \ - for _ in range(0, w)]\ - for _ in range(0, w)], dtype='int32') + +res = numpy.array([[0 + for _ in range(0, w)] + for _ in range(0, w)], dtype="int32") print(res) run(res, 0) - diff --git a/examples/from-wiki/matrix_transpose.py b/examples/from-wiki/matrix_transpose.py index bc2428d76acfb51127215ee81d7ea5c7cbb8a5fd..a2e66d2543414d078b716deedddc619ad7b885a8 100644 --- a/examples/from-wiki/matrix_transpose.py +++ b/examples/from-wiki/matrix_transpose.py @@ -1,22 +1,23 @@ -#!python +#!python # Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Transposition of a matrix # by Hendrik Riedmann +from __future__ import annotations +import numpy +import numpy.linalg as la +import pycuda.autoinit import pycuda.driver as cuda import pycuda.gpuarray as gpuarray -import pycuda.autoinit from pycuda.compiler import SourceModule - -import numpy -import numpy.linalg as la - from pycuda.tools import context_dependent_memoize + block_size = 16 + @context_dependent_memoize def _get_transpose_kernel(): mod = SourceModule(""" @@ -46,12 +47,13 @@ def _get_transpose_kernel(): // Write transposed submatrix to global memory A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y]; } - """% {"block_size": block_size}) + """ % {"block_size": block_size}) func = mod.get_function("transpose") func.prepare("PPii") from pytools import Record + class TransposeKernelInfo(Record): pass return TransposeKernelInfo(func=func, @@ -60,7 +62,6 @@ def _get_transpose_kernel(): granularity=block_size) - def _get_big_block_transpose_kernel(): mod = SourceModule(""" #define BLOCK_SIZE %(block_size)d @@ -101,12 +102,13 @@ def _get_big_block_transpose_kernel(): A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] = A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE]; } - """% {"block_size": block_size}) + """ % {"block_size": block_size}) func = mod.get_function("transpose") func.prepare("PPii") from pytools import Record + class TransposeKernelInfo(Record): pass return TransposeKernelInfo(func=func, @@ -115,8 +117,6 @@ def _get_big_block_transpose_kernel(): granularity=2*block_size) - - def _transpose(tgt, src): krnl = _get_transpose_kernel() @@ -130,8 +130,6 @@ def _transpose(tgt, src): tgt.gpudata, src.gpudata, w, h) - - def transpose(src): w, h = src.shape @@ -140,9 +138,6 @@ def transpose(src): return result - - - def check_transpose(): from pycuda.curandom import rand @@ -163,8 +158,6 @@ def check_transpose(): assert err_norm == 0, (size, 
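
# diffusion() in manhattan_distance_for_2D_array.py above relies on
# cuda.InOut(res), which bundles the host-to-device copy, the kernel argument,
# and the device-to-host copy into a single call. The wrapper in isolation
# (toy doubling kernel; assumes a context via pycuda.autoinit):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void double_it(float *a)
{
    a[threadIdx.x] *= 2.0f;
}
""")
double_it = mod.get_function("double_it")

a = numpy.ones(32, dtype=numpy.float32)
double_it(cuda.InOut(a), block=(32, 1, 1))   # copy in, run, copy back out
assert (a == 2).all()
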
err_norm) - - def run_benchmark(): from pycuda.curandom import rand @@ -183,7 +176,7 @@ def run_benchmark(): warmup = 2 - for i in range(warmup): + for _i in range(warmup): _transpose(target, source) count = 10 @@ -191,7 +184,7 @@ def run_benchmark(): cuda.Context.synchronize() start.record() - for i in range(count): + for _i in range(count): _transpose(target, source) stop.record() @@ -206,21 +199,17 @@ def run_benchmark(): slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9] print(("Sizes for which bandwidth was low:", slow_sizes)) print(("Ditto, mod 64:", [s % 64 for s in slow_sizes])) - from matplotlib.pyplot import semilogx, loglog, show, savefig, clf, xlabel, ylabel - xlabel('matrix size') - ylabel('bandwidth') + from matplotlib.pyplot import clf, loglog, savefig, semilogx, xlabel, ylabel + xlabel("matrix size") + ylabel("bandwidth") semilogx(sizes, bandwidths) savefig("transpose-bw.png") clf() - xlabel('matrix size') - ylabel('time') + xlabel("matrix size") + ylabel("time") loglog(sizes, times) savefig("transpose-times.png") - - -#check_transpose() +# check_transpose() run_benchmark() - - diff --git a/examples/from-wiki/matrixmul_simple.py b/examples/from-wiki/matrixmul_simple.py index 13e286d24fbaaa8c4152703989f9307ec45dc74e..4b37362ddbeb45aea24364112c986740aab3dee8 100644 --- a/examples/from-wiki/matrixmul_simple.py +++ b/examples/from-wiki/matrixmul_simple.py @@ -1,17 +1,18 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -Multiplies two square matrices together using a *single* block of threads and +""" +Multiplies two square matrices together using a *single* block of threads and global memory only. Each thread computes one element of the resulting matrix. """ +from __future__ import annotations import numpy as np -from pycuda import driver, compiler, gpuarray, tools # -- initialize the device -import pycuda.autoinit +from pycuda import compiler, gpuarray + kernel_code_template = """ __global__ void MatrixMulKernel(float *a, float *b, float *c) @@ -24,7 +25,7 @@ __global__ void MatrixMulKernel(float *a, float *b, float *c) // that is computed by the thread float Pvalue = 0; - // Each thread loads one row of M and one column of N, + // Each thread loads one row of M and one column of N, // to produce one element of P. 
for (int k = 0; k < %(MATRIX_SIZE)s; ++k) { float Aelement = a[ty * %(MATRIX_SIZE)s + k]; @@ -52,20 +53,20 @@ b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) # compute reference on the CPU to verify GPU computation c_cpu = np.dot(a_cpu, b_cpu) -# transfer host (CPU) memory to device (GPU) memory -a_gpu = gpuarray.to_gpu(a_cpu) +# transfer host (CPU) memory to device (GPU) memory +a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) # create empty gpu array for the result (C = A * B) c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32) -# get the kernel code from the template +# get the kernel code from the template # by specifying the constant MATRIX_SIZE kernel_code = kernel_code_template % { - 'MATRIX_SIZE': MATRIX_SIZE + "MATRIX_SIZE": MATRIX_SIZE } -# compile the kernel code +# compile the kernel code mod = compiler.SourceModule(kernel_code) # get the kernel function from the compiled module @@ -74,11 +75,11 @@ matrixmul = mod.get_function("MatrixMulKernel") # call the kernel on the card matrixmul( # inputs - a_gpu, b_gpu, + a_gpu, b_gpu, # output - c_gpu, + c_gpu, # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads - block = (MATRIX_SIZE, MATRIX_SIZE, 1), + block=(MATRIX_SIZE, MATRIX_SIZE, 1), ) # print the results @@ -99,5 +100,3 @@ print("CPU-GPU difference:") print(c_cpu - c_gpu.get()) np.allclose(c_cpu, c_gpu.get()) - - diff --git a/examples/from-wiki/matrixmul_tiled.py b/examples/from-wiki/matrixmul_tiled.py index 225075ba96d7247f106afad166a83474e8d32596..0a50ac66be97487885dd1f47278a4da730795cb4 100644 --- a/examples/from-wiki/matrixmul_tiled.py +++ b/examples/from-wiki/matrixmul_tiled.py @@ -1,29 +1,30 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -Multiples two square matrices together using multiple blocks and shared memory. +""" +Multiples two square matrices together using multiple blocks and shared memory. Each thread block is assigned a "tile" of the resulting matrix and is responsible -for generating the elements in that tile. Each thread in a block computes one element +for generating the elements in that tile. Each thread in a block computes one element of the tile. 
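
# matrixmul_simple.py above specializes its CUDA source with printf-style
# "%(NAME)s" substitution before compiling. The same mechanism in miniature
# (toy kernel; assumes a context via pycuda.autoinit):

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

template = """
__global__ void scale(float *x)
{
    x[threadIdx.x] *= %(FACTOR)s;
}
"""
scale = SourceModule(template % {"FACTOR": "2.0f"}).get_function("scale")
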
""" +from __future__ import annotations import numpy as np from numpy import linalg as la -from pycuda import driver, compiler, gpuarray, tools # -- initialize the device -import pycuda.autoinit +from pycuda import compiler, gpuarray + kernel_code_template = """ __global__ void MatrixMulKernel(float *A, float *B, float *C) { const uint wA = %(MATRIX_SIZE)s; - const uint wB = %(MATRIX_SIZE)s; - + const uint wB = %(MATRIX_SIZE)s; + // Block index const uint bx = blockIdx.x; const uint by = blockIdx.y; @@ -51,7 +52,7 @@ __global__ void MatrixMulKernel(float *A, float *B, float *C) // compute the block sub-matrix for (int a = aBegin, b = bBegin; a <= aEnd; - a += aStep, b += bStep) + a += aStep, b += bStep) { // Shared memory for the sub-matrix of A __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s]; @@ -87,7 +88,7 @@ __global__ void MatrixMulKernel(float *A, float *B, float *C) # define the (square) matrix size MATRIX_SIZE = 4 -# define size of blocks and tiles sub-matrix +# define size of blocks and tiles sub-matrix # (we assume that the block size is same as tile size) TILE_SIZE = 2 BLOCK_SIZE = TILE_SIZE @@ -99,18 +100,18 @@ b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) # compute reference on the CPU to verify GPU computation c_cpu = np.dot(a_cpu, b_cpu) -# transfer host (CPU) memory to device (GPU) memory -a_gpu = gpuarray.to_gpu(a_cpu) +# transfer host (CPU) memory to device (GPU) memory +a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) # create empty gpu array for the result (C = A * B) c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32) -# get the kernel code from the template +# get the kernel code from the template # by specifying the constants MATRIX_SIZE and BLOCK_SIZE -kernel_code = kernel_code_template % { - 'MATRIX_SIZE': MATRIX_SIZE, - 'BLOCK_SIZE': BLOCK_SIZE, +kernel_code = kernel_code_template % { + "MATRIX_SIZE": MATRIX_SIZE, + "BLOCK_SIZE": BLOCK_SIZE, } # compile the kernel code @@ -122,13 +123,13 @@ matrixmul = mod.get_function("MatrixMulKernel") # call the kernel on the card matrixmul( # inputs - a_gpu, b_gpu, + a_gpu, b_gpu, # output - c_gpu, + c_gpu, # grid of multiple blocks - grid = (MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE), + grid=(MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE), # block of multiple threads - block = (TILE_SIZE, TILE_SIZE, 1), + block=(TILE_SIZE, TILE_SIZE, 1), ) # print the results @@ -149,5 +150,3 @@ print("CPU-GPU difference:") print(c_cpu - c_gpu.get()) print("L2 norm:", la.norm(c_cpu - c_gpu.get())) np.allclose(c_cpu, c_gpu.get()) - - diff --git a/examples/from-wiki/measure_gpuarray_speed_random.py b/examples/from-wiki/measure_gpuarray_speed_random.py index fbd793d479d4f4836a346d9aa5352b2a83684ca4..517863408455e7651d9e003c6770dd3ee3d88821 100644 --- a/examples/from-wiki/measure_gpuarray_speed_random.py +++ b/examples/from-wiki/measure_gpuarray_speed_random.py @@ -1,16 +1,15 @@ -#!python -#! /usr/bin/env python +#!python +# ! /usr/bin/env python # DO NOT USE THIS AS A BENCHMARK. 
See # http://documen.tician.de/pycuda/array.html#pycuda.curandom.rand +from __future__ import annotations -import pycuda.autoinit -import pycuda.driver as drv -import pycuda.curandom as curandom import numpy -import numpy.linalg as la -from pytools import Table +from pytools import Table +import pycuda.curandom as curandom +import pycuda.driver as drv def main(): @@ -22,72 +21,67 @@ def main(): flopsCPU = [] timesCPU = [] - for power in range(10, 25): # 24 - size = 1< 20: - count = 100 - else: - count = 1000 + count = 100 if power > 20 else 1000 - #start timer + # start timer start = drv.Event() end = drv.Event() start.record() - #cuda operation which fills the array with random numbers - for i in range(count): + # cuda operation which fills the array with random numbers + for _i in range(count): curandom.rand((size, )) - #stop timer + # stop timer end.record() end.synchronize() - #calculate used time + # calculate used time secs = start.time_till(end)*1e-3 times.append(secs/count) flops.append(size) - #cpu operations which fills teh array with random data - a = numpy.array((size,), dtype=numpy.float32) + # cpu operations which fills teh array with random data + numpy.array((size,), dtype=numpy.float32) - #start timer + # start timer start = drv.Event() end = drv.Event() start.record() - #cpu operation which fills the array with random data - for i in range(count): + # cpu operation which fills the array with random data + for _i in range(count): numpy.random.rand(size).astype(numpy.float32) - #stop timer + # stop timer end.record() end.synchronize() - #calculate used time + # calculate used time secs = start.time_till(end)*1e-3 - #add results to variable + # add results to variable timesCPU.append(secs/count) flopsCPU.append(size) + # calculate pseudo flops + flops = [f/t for f, t in zip(flops, times)] + flopsCPU = [f/t for f, t in zip(flopsCPU, timesCPU)] - #calculate pseudo flops - flops = [f/t for f, t in zip(flops,times)] - flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)] - - #print the data out + # print the data out tbl = Table() - tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup")) - for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU): - tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu)) + tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU", "Size/Time CPU", "GPU vs CPU speedup")) + for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU): + tbl.add_row((s, t, f, tCpu, fCpu, f/fCpu)) print(tbl) if __name__ == "__main__": main() - diff --git a/examples/from-wiki/multiple_threads.py b/examples/from-wiki/multiple_threads.py index a067766d6b0829dbca83527f6339130483194fef..ae51c8aab4e7524149f8941933bff06cf470fdf2 100644 --- a/examples/from-wiki/multiple_threads.py +++ b/examples/from-wiki/multiple_threads.py @@ -1,13 +1,15 @@ -#!python +#!python # Derived from a test case by Chris Heuser # Also see FAQ about PyCUDA and threads. 
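
# The GPUThread class below gives each Python thread its own CUDA context.
# The essential per-thread lifecycle is small -- a sketch, with cuda.init()
# called once in the main thread before any worker starts:

import pycuda.driver as cuda

def worker(device_index):
    dev = cuda.Device(device_index)
    ctx = dev.make_context()   # becomes the current context in this thread
    try:
        pass  # ... allocate memory and launch kernels here ...
    finally:
        ctx.pop()              # detach the context before the thread exits
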
+from __future__ import annotations +import threading + +import numpy -import pycuda import pycuda.driver as cuda from pycuda.compiler import SourceModule -import threading -import numpy + class GPUThread(threading.Thread): def __init__(self, number, some_array): @@ -30,6 +32,7 @@ class GPUThread(threading.Thread): del self.array_gpu del self.ctx + def test_kernel(input_array_gpu): mod = SourceModule(""" __global__ void f(float * out, float * in) @@ -40,18 +43,19 @@ def test_kernel(input_array_gpu): """) func = mod.get_function("f") - output_array = numpy.zeros((1,512)) + output_array = numpy.zeros((1, 512)) output_array_gpu = cuda.mem_alloc(output_array.nbytes) func(output_array_gpu, input_array_gpu, - block=(512,1,1)) + block=(512, 1, 1)) cuda.memcpy_dtoh(output_array, output_array_gpu) return output_array + cuda.init() -some_array = numpy.ones((1,512), dtype=numpy.float32) +some_array = numpy.ones((1, 512), dtype=numpy.float32) num = cuda.Device.count() gpu_thread_list = [] @@ -59,5 +63,3 @@ for i in range(num): gpu_thread = GPUThread(i, some_array) gpu_thread.start() gpu_thread_list.append(gpu_thread) - - diff --git a/examples/from-wiki/plot_random_data.py b/examples/from-wiki/plot_random_data.py index 2e0118c3fe59e78d53ca77d85abcaf89ba1e2495..9f0ef4195b2dc1d0f2dc0703b31f3fb5509d419e 100644 --- a/examples/from-wiki/plot_random_data.py +++ b/examples/from-wiki/plot_random_data.py @@ -1,24 +1,25 @@ -#!python +#!python # simple module to show the plotting of random data +from __future__ import annotations -import pycuda.autoinit import pycuda.curandom as curandom + size = 1000 a = curandom.rand((size,)).get() from matplotlib.pylab import * + + subplot(211) plot(a) grid(True) -ylabel('plot - gpu') +ylabel("plot - gpu") subplot(212) hist(a, 100) grid(True) -ylabel('histogram - gpu') - -#and save it -savefig('plot-random-data') - +ylabel("histogram - gpu") +# and save it +savefig("plot-random-data") diff --git a/examples/from-wiki/rotate.py b/examples/from-wiki/rotate.py index 0adc26ba8f480f4571052d9693046fe26d811a64..1c89ba010d74d946fb3af500529d40b66362001b 100644 --- a/examples/from-wiki/rotate.py +++ b/examples/from-wiki/rotate.py @@ -1,23 +1,27 @@ -#!python -#!/usr/bin/env python -tt +#!python +# !/usr/bin/env python -tt # encoding: utf-8 # # Created by Holger Rapp on 2009-03-11. # HolgerRapp@gmx.net # +from __future__ import annotations + +from math import cos, pi, sin -import pycuda.driver as cuda -import pycuda.compiler -import pycuda.autoinit import numpy -from math import pi,cos,sin + +import pycuda.autoinit +import pycuda.compiler +import pycuda.driver as cuda + _rotation_kernel_source = """ texture tex; __global__ void copy_texture_kernel( - const float resize_val, - const float alpha, + const float resize_val, + const float alpha, unsigned short oldiw, unsigned short oldih, unsigned short newiw, unsigned short newih, unsigned char* data) { @@ -25,38 +29,39 @@ __global__ void copy_texture_kernel( // calculate pixel idx unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - + // We might be outside the reachable pixels. Don't do anything if( (x >= newiw) || (y >= newih) ) return; - + // calculate offset into destination array unsigned int didx = y * newiw + x; - + // calculate offset into source array (be aware of rotation and scaling) float xmiddle = (x-newiw/2.) / resize_val; float ymiddle = (y-newih/2.) / resize_val; float sx = ( xmiddle*cos(alpha)+ymiddle*sin(alpha) + oldiw/2.) 
; float sy = ( -xmiddle*sin(alpha)+ymiddle*cos(alpha) + oldih/2.); - - if( (sx < 0) || (sx >= oldiw) || (sy < 0) || (sy >= oldih) ) { - data[didx] = 255; + + if( (sx < 0) || (sx >= oldiw) || (sy < 0) || (sy >= oldih) ) { + data[didx] = 255; return; } data[didx] = tex2D(tex, sx, sy); } """ -mod_copy_texture=pycuda.compiler.SourceModule( _rotation_kernel_source ) +mod_copy_texture = pycuda.compiler.SourceModule(_rotation_kernel_source) copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel") texref = mod_copy_texture.get_texref("tex") -def rotate_image( a, resize = 1.5, angle = 20., interpolation = "linear", blocks = (16,16,1) ): + +def rotate_image(a, resize=1.5, angle=20., interpolation="linear", blocks=(16, 16, 1)): """ Rotates the array. The new array has the new size and centers the picture in the middle. - + a - array (2-dim) resize - new_image w/old_image w angle - degrees to rotate the image @@ -67,72 +72,74 @@ def rotate_image( a, resize = 1.5, angle = 20., interpolation = "linear", blocks returns: a new array with dtype=uint8 containing the rotated image """ angle = angle/180. *pi - - # Convert this image to float. Unsigned int texture gave + + # Convert this image to float. Unsigned int texture gave # strange results for me. This conversion is slow though :( a = a.astype("float32") # Calculate the dimensions of the new image - calc_x = lambda x_y: (x_y[0]*a.shape[1]/2.*cos(angle)-x_y[1]*a.shape[0]/2.*sin(angle)) - calc_y = lambda x_y1: (x_y1[0]*a.shape[1]/2.*sin(angle)+x_y1[1]*a.shape[0]/2.*cos(angle)) + def calc_x(x_y): + return (x_y[0]*a.shape[1]/2.*cos(angle)-x_y[1]*a.shape[0]/2.*sin(angle)) - xs = [ calc_x(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ] - ys = [ calc_y(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ] + def calc_y(x_y1): + return (x_y1[0]*a.shape[1]/2.*sin(angle)+x_y1[1]*a.shape[0]/2.*cos(angle)) + + xs = [calc_x(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]] + ys = [calc_y(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]] new_image_dim = ( int(numpy.ceil(max(ys)-min(ys))*resize), int(numpy.ceil(max(xs)-min(xs))*resize), ) - + # Now generate the cuda texture cuda.matrix_to_texref(a, texref, order="C") - + # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) if interpolation == "linear": texref.set_filter_mode(cuda.filter_mode.LINEAR) - # Calculate the gridsize. This is entirely given by the size of our image. + # Calculate the gridsize. This is entirely given by the size of our image. 
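
# Heads-up on the gridsize computation just below: under Python 3 the "/" in
# gridx/gridy produces floats, and kernel grid dimensions must be ints (note
# also that the x branch tests "% blocks[0] == 1" while the y branch tests
# "== 0"). Integer ceiling division sidesteps both -- a sketch:

def ceil_div(a, b):
    """Smallest integer >= a/b, using only integer arithmetic."""
    return -(-a // b)

# e.g. grid = (ceil_div(new_image_dim[0], blocks[0]),
#              ceil_div(new_image_dim[1], blocks[1]))
assert ceil_div(770, 16) == 49
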
gridx = new_image_dim[0]/blocks[0] if \ - new_image_dim[0]%blocks[0]==1 else new_image_dim[0]/blocks[0] +1 + new_image_dim[0] % blocks[0] == 1 else new_image_dim[0]/blocks[0] +1 gridy = new_image_dim[1]/blocks[1] if \ - new_image_dim[1]%blocks[1]==0 else new_image_dim[1]/blocks[1] +1 + new_image_dim[1] % blocks[1] == 0 else new_image_dim[1]/blocks[1] +1 # Get the output image - output = numpy.zeros(new_image_dim,dtype="uint8") - + output = numpy.zeros(new_image_dim, dtype="uint8") + # Call the kernel copy_texture_func( numpy.float32(resize), numpy.float32(angle), numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]), numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]), - cuda.Out(output),texrefs=[texref],block=blocks,grid=(gridx,gridy)) - + cuda.Out(output), texrefs=[texref], block=blocks, grid=(gridx, gridy)) + return output -if __name__ == '__main__': - import Image + +if __name__ == "__main__": import sys - - def main( ): + + import Image + + def main(): if len(sys.argv) != 2: print("You should really read the source...\n\nUsage: rotate.py \n") sys.exit(-1) # Open, convert to grayscale, convert to numpy array img = Image.open(sys.argv[1]).convert("L") - i = numpy.fromstring(img.tostring(),dtype="uint8").reshape(img.size[1],img.size[0]) - + i = numpy.fromstring(img.tostring(), dtype="uint8").reshape(img.size[1], img.size[0]) + # Rotate & convert back to PIL Image irot = rotate_image(i) - rotimg = Image.fromarray(irot,mode="L") + rotimg = Image.fromarray(irot, mode="L") # Save and display rotimg.save("rotated.png") rotimg.show() - - main() - - + main() diff --git a/examples/from-wiki/select_to_list.py b/examples/from-wiki/select_to_list.py index b221cccd43ffbca7c7e75c3128d16e3fff98c781..ff3d34c54e3ee4ef6e41da4106582b31cfedfb88 100644 --- a/examples/from-wiki/select_to_list.py +++ b/examples/from-wiki/select_to_list.py @@ -1,14 +1,16 @@ -#!python +#!python # Exercise 2 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Generate an array of random numbers between 0 and 1 # List the indices of those numbers that are greater than a given limit +from __future__ import annotations + +import numpy import pycuda.driver as cuda -import pycuda.autoinit import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy + # Define block size and number of elements per thread block_size = 512 @@ -106,7 +108,7 @@ grid = (amount // multiple_block_size, 1) # Warmup warmup = 2 -for i in range(warmup): +for _i in range(warmup): func.prepared_call(grid, block, a_gpu.gpudata, selec_gpu.gpudata, limit, counter_gpu.gpudata) counter_gpu = gpuarray.zeros(1, dtype=numpy.int32) @@ -119,7 +121,7 @@ stop = cuda.Event() cuda.Context.synchronize() start.record() count = 10 -for i in range(count): +for _i in range(count): func.prepared_call(grid, block, a_gpu.gpudata, selec_gpu.gpudata, limit, counter_gpu.gpudata) counter_gpu = gpuarray.zeros(1, dtype=numpy.int32) @@ -136,7 +138,6 @@ elems_in_selec = len(numpy.nonzero(selec >= 0)) elapsed_seconds = stop.time_since(start) * 1e-3 print("mem bw:", (a.nbytes + elems_in_selec * 4) / elapsed_seconds / 1e9 * count) -filtered_set = sorted(list(item for item in selec if item != -1)) -reference_set = sorted(list(i for i, x in enumerate(a) if x >= limit)) +filtered_set = sorted(item for item in selec if item != -1) +reference_set = sorted(i for i, x in enumerate(a) if x >= limit) assert filtered_set == reference_set - diff --git a/examples/from-wiki/simple_rgb2gray.py b/examples/from-wiki/simple_rgb2gray.py index 
9d4b155723adc798a70f5bb61c40d23b11d80bc8..125808100806e77cc5e54c54568359254fff681f 100644 --- a/examples/from-wiki/simple_rgb2gray.py +++ b/examples/from-wiki/simple_rgb2gray.py @@ -1,17 +1,18 @@ -#!python +#!python +from __future__ import annotations -__author__ = 'ashwin' -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -from pycuda.compiler import SourceModule +__author__ = "ashwin" + +import matplotlib.pyplot as p import numpy as np import scipy.misc as scm -import matplotlib.pyplot as p -mod = SourceModule \ - ( +import pycuda.driver as drv +from pycuda.compiler import SourceModule + + +mod = SourceModule( """ #include #define INDEX(a, b) a*256+b @@ -32,21 +33,18 @@ dest[INDEX(a, b)] = (0.299*r_img[INDEX(a, b)]+0.587*g_img[INDEX(a, b)]+0.114*b_i """ ) -a = scm.imread('Lenna.png').astype(np.float32) +a = scm.imread("Lenna.png").astype(np.float32) print(a) -r_img = a[:, :, 0].reshape(65536, order='F') -g_img = a[:, :, 1].reshape(65536, order='F') -b_img = a[:, :, 2].reshape(65536, order='F') -dest=r_img +r_img = a[:, :, 0].reshape(65536, order="F") +g_img = a[:, :, 1].reshape(65536, order="F") +b_img = a[:, :, 2].reshape(65536, order="F") +dest = r_img print(dest) rgb2gray = mod.get_function("rgb2gray") -rgb2gray(drv.Out(dest), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(64, 1, 1)) +rgb2gray(drv.Out(dest), drv.In(r_img), drv.In(g_img), drv.In(b_img), block=(1024, 1, 1), grid=(64, 1, 1)) -dest=np.reshape(dest,(256,256), order='F') +dest = np.reshape(dest, (256, 256), order="F") p.imshow(dest) p.show() - - - diff --git a/examples/from-wiki/simple_speed_test.py b/examples/from-wiki/simple_speed_test.py index aadefd2bb6ad791a9858f270a1ea148ec51976f3..6f603628d60dbf92e80d0472b113dd9335e94b55 100644 --- a/examples/from-wiki/simple_speed_test.py +++ b/examples/from-wiki/simple_speed_test.py @@ -1,4 +1,4 @@ -#!python +#!python # SimpleSpeedTest.py # Very simple speed testing code @@ -24,7 +24,7 @@ # CPU time and first three results: # 32.933660s, [ 0.005477 0.005477 0.005477] # -# +# # Using Win 7 x64, GTX 470 GPU, X5650 Xeon, # Driver v301.42, CUDA 4.2, Python 2.7 x64, # PyCuda 2012.1 gave the following results: @@ -41,17 +41,19 @@ # 8.421861s, [ 0.005477 0.005477 0.005477] # CPU time measured using : # 5.905661s, [ 0.005477 0.005477 0.005477] +from __future__ import annotations +import numpy +import pycuda.autoinit +import pycuda.cumath import pycuda.driver as drv +import pycuda.gpuarray as gpuarray import pycuda.tools -import pycuda.autoinit -import numpy from pycuda.compiler import SourceModule -import pycuda.gpuarray as gpuarray -import pycuda.cumath from pycuda.elementwise import ElementwiseKernel + blocks = 64 block_size = 128 nbr_values = blocks * block_size @@ -89,9 +91,9 @@ a = numpy.ones(nbr_values).astype(numpy.float32) # create a destination array that will receive the result dest = numpy.zeros_like(a) -start.record() # start timing -gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) ) -end.record() # end timing +start.record() # start timing +gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks, 1), block=(block_size, 1, 1)) +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -109,9 +111,9 @@ kernel = ElementwiseKernel( a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -kernel(a_gpu, numpy.int(n_iter)) -end.record() # end timing +start.record() # start timing 
+kernel(a_gpu, int(n_iter)) +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -129,10 +131,10 @@ kernel = ElementwiseKernel( a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -for i in range(n_iter): +start.record() # start timing +for _i in range(n_iter): kernel(a_gpu) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -146,10 +148,10 @@ print("%fs, %s" % (secs, str(a_gpu.get()[:3]))) a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -for i in range(n_iter): +start.record() # start timing +for _i in range(n_iter): a_gpu = pycuda.cumath.sin(a_gpu) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -162,16 +164,15 @@ print("%fs, %s" % (secs, str(a_gpu.get()[:3]))) # use numpy the calculate the result on the CPU for reference a = numpy.ones(nbr_values).astype(numpy.float32) -start.record() # start timing +start.record() # start timing start.synchronize() -for i in range(n_iter): +for _i in range(n_iter): a = numpy.sin(a) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 print("CPU time and first three results:") print("%fs, %s" % (secs, str(a[:3]))) - diff --git a/examples/from-wiki/sobel_filter.py b/examples/from-wiki/sobel_filter.py index f5ad622ffce30ea892bfaeec98b7d6f33a37de32..b3c51cce0d42ce7731d4a44ed5b5b04cd453d471 100644 --- a/examples/from-wiki/sobel_filter.py +++ b/examples/from-wiki/sobel_filter.py @@ -1,6 +1,6 @@ -#!python -#!/usr/bin/env python -#-*- coding: utf-8 -*- +#!python +# !/usr/bin/env python +# -*- coding: utf-8 -*- # # Requires PyCuda, PyOpenGL, and Pil # MAKE SURE YOU HAVE AN UPDATED VERSION OF THESE PACKAGES!! @@ -20,19 +20,27 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. # +from __future__ import annotations +import os +import sys +import time + +import Image +import numpy as np from OpenGL.GL import * -from OpenGL.GLUT import * -from OpenGL.GLU import * from OpenGL.GL.ARB.vertex_buffer_object import * -import numpy as np, Image -import sys, time, os +from OpenGL.GLU import * +from OpenGL.GLUT import * + +import pycuda import pycuda.driver as cuda_driver import pycuda.gl as cuda_gl -import pycuda -#import pycuda.gl.autoinit + +# import pycuda.gl.autoinit from pycuda.compiler import SourceModule + imWidth = 0 imHeight = 0 wWidth = 0 @@ -52,6 +60,7 @@ fpsLimit = 8 timer = 0.0 ver2011 = False + def copy2D_array_to_device(dst, src, type_sz, width, height): copy = cuda_driver.Memcpy2D() copy.set_src_array(src) @@ -60,6 +69,7 @@ def copy2D_array_to_device(dst, src, type_sz, width, height): copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width*type_sz copy(aligned=True) + def computeFPS(): global frameCount, fpsCount, fpsLimit, timer frameCount += 1 @@ -69,6 +79,7 @@ def computeFPS(): glutSetWindowTitle("Cuda Edge Detection: %f fps" % ifps) fpsCount = 0 + def sobelFilter(odata, iw, ih): global array, pixels, mode, scale if mode == 3: @@ -369,27 +380,28 @@ def sobelFilter(odata, iw, ih): # fixed BlockSize Launch RADIUS = 1 threads = (16, 4, 1) - BlockWidth = 80 # Do not change! 
- blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)), - ih/threads[1]+(0!=ih%threads[1]) ) - SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f); - sharedMem = SharedPitch*(threads[1]+2*RADIUS); + BlockWidth = 80 # Do not change! + blocks = (iw/(4*BlockWidth)+(iw % (4*BlockWidth) != 0), + ih/threads[1]+(ih % threads[1] != 0)) + SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f) + sharedMem = SharedPitch*(threads[1]+2*RADIUS) iw = iw & ~3 - cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem) + cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref], block=threads, grid=blocks, shared=sharedMem) elif mode == 2: # variable BlockSize launch RADIUS = 1 threads = (16, 4, 1) - BlockWidth = 80 # Change only with divisible by 16 values! - blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)), - ih/threads[1]+(0!=ih%threads[1]) ) - SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f); - sharedMem = SharedPitch*(threads[1]+2*RADIUS); + BlockWidth = 80 # Change only with divisible by 16 values! + blocks = (iw/(4*BlockWidth)+(iw % (4*BlockWidth) != 0), + ih/threads[1]+(ih % threads[1] != 0)) + SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f) + sharedMem = SharedPitch*(threads[1]+2*RADIUS) iw = iw & ~3 - cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem) + cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref], block=threads, grid=blocks, shared=sharedMem) else: BlockWidth = 384 - cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref],block=(BlockWidth,1,1),grid=(ih,1)) + cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref], block=(BlockWidth, 1, 1), grid=(ih, 1)) + def initGL(): global wWidth, wHeight, wName @@ -397,72 +409,72 @@ def initGL(): glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA) glutInitWindowSize(wWidth, wHeight) glutCreateWindow(wName) - import pycuda.gl.autoinit + def loadImage(fn=None): global pixels, imWidth, imHeight, wWidth, wHeight try: - im = Image.open(fn) # Open the image - except IOError: + im = Image.open(fn) # Open the image + except OSError: print("Usage:", os.path.basename(sys.argv[0]), "[IMAGE=defaultimage.jpg]") print("Can't open", fn) sys.exit(1) - imWidth, imHeight = im.size # Window size is set to image size + imWidth, imHeight = im.size # Window size is set to image size wWidth, wHeight = im.size - im.draft("L", im.size) # L-flag is for Luminance - pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array - pixels.resize((imHeight, imWidth)) # Resize to 2d array + im.draft("L", im.size) # L-flag is for Luminance + pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array + pixels.resize((imHeight, imWidth)) # Resize to 2d array print("Reading image:", fn, "size:", imWidth, "x", imHeight) + def initData(fn=None): global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid # Cuda array initialization - array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major + array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major - pixels.fill(0) # Resetting the 
array to 0 + pixels.fill(0) # Resetting the array to 0 - pbo_buffer = glGenBuffers(1) # generate 1 buffer reference - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer - glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer - bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size - assert(bsize == imWidth*imHeight) - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind + pbo_buffer = glGenBuffers(1) # generate 1 buffer reference + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer + glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer + bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size + assert (bsize == imWidth*imHeight) + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind if ver2011: cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD) else: - cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource - + cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource - glGenTextures(1, texid); # generate 1 texture reference - glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture - glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture - glBindTexture(GL_TEXTURE_2D, 0) # Unbind + glGenTextures(1, texid) # generate 1 texture reference + glBindTexture(GL_TEXTURE_2D, texid) # binding to this texture + glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None) # Allocate the texture + glBindTexture(GL_TEXTURE_2D, 0) # Unbind - glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment - glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment + glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment + glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment def display(): global cuda_pbo_resource, pbo_buffer, texid, imWidth, imHeight, timer - timer = time.time() # Starting timer - mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer + timer = time.time() # Starting timer + mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer if ver2011: - data, sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer + data, _sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer else: data = mapping_obj.device_ptr() - sobelFilter(data, imWidth, imHeight) # Writing to "data" - mapping_obj.unmap() # Unmap the GlBuffer + sobelFilter(data, imWidth, imHeight) # Writing to "data" + mapping_obj.unmap() # Unmap the GlBuffer - glClear(GL_COLOR_BUFFER_BIT) # Clear + glClear(GL_COLOR_BUFFER_BIT) # Clear glBindTexture(GL_TEXTURE_2D, texid) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # Copyng from buffer to texture glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, None) - #glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind + # glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind glDisable(GL_DEPTH_TEST) glEnable(GL_TEXTURE_2D) @@ -487,6 +499,7 @@ def display(): computeFPS() glutPostRedisplay() + def reshape(x, y): glViewport(0, 0, x, y) 
glMatrixMode(GL_PROJECTION) @@ -496,32 +509,35 @@ glLoadIdentity() glutPostRedisplay() + def keyboard(key, x=0, y=0): global mode, scale - if key=="q": + if key == "q": sys.exit(0) - elif key=="I" or key=="i": + elif key == "I" or key == "i": mode = 0 - elif key=="T" or key=="t": + elif key == "T" or key == "t": mode = 1 - elif key=="S" or key=="s": + elif key == "S" or key == "s": mode = 2 - elif key=="D" or key=="d": + elif key == "D" or key == "d": mode = 3 elif key == "-": scale -= 0.1 elif key == "=": scale += 0.1 + def idle(): glutPostRedisplay() + def main(argv): fn = "defaultimage.jpg" if len(argv) > 1: fn = argv[1] - loadImage(fn) # Loading the image + loadImage(fn) # Loading the image initGL() initData(fn) @@ -540,11 +556,10 @@ def main(argv): glutKeyboardFunc(keyboard) glutReshapeFunc(reshape) glutIdleFunc(idle) - glutMainLoop(); + glutMainLoop() + if __name__ == "__main__": if pycuda.VERSION[0] >= 2011: ver2011 = True main(sys.argv) - - diff --git a/examples/from-wiki/sparse_solve.py b/examples/from-wiki/sparse_solve.py index 243579e0ca21fa757b00215825835c81e9c2975f..38505249a1a613fb3513782647f03a371543680d 100644 --- a/examples/from-wiki/sparse_solve.py +++ b/examples/from-wiki/sparse_solve.py @@ -1,11 +1,11 @@ -#!python -import pycuda.autoinit -import pycuda.driver as drv -import pycuda.gpuarray as gpuarray +#!python +from __future__ import annotations + import numpy import numpy.linalg as la - +import pycuda.driver as drv +import pycuda.gpuarray as gpuarray def main_cg(): @@ -41,7 +41,7 @@ def main_cg(): from pycuda.sparse.cg import solve_pkt_with_cg print("start solve") - for i in range(4): + for _i in range(4): start = drv.Event() stop = drv.Event() start.record() @@ -64,11 +64,11 @@ def main_cg(): est_flops += csr_mat.shape[0] * it_count print("residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs))) - print(("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " + print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " "%g gflops/s" % ( csr_mat.shape[0], elapsed, it_count, res_count, it_count/elapsed, - est_flops/elapsed/1e9))) + est_flops/elapsed/1e9)) # TODO: mixed precision # TODO: benchmark @@ -76,12 +76,6 @@ def main_cg(): dev_pool.stop_holding() - - - if __name__ == "__main__": print("starting...") main_cg() - - - diff --git a/examples/from-wiki/threads_and_blocks.py b/examples/from-wiki/threads_and_blocks.py index caa851f30a3424f42ad0b99da77abcb0e926d812..b94caa0b6f913d60d74ec74ab0d95c5e3b5b9334 100644 --- a/examples/from-wiki/threads_and_blocks.py +++ b/examples/from-wiki/threads_and_blocks.py @@ -1,8 +1,9 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + from pycuda.compiler import SourceModule + mod = SourceModule(""" #include <stdio.h> @@ -13,5 +14,4 @@ mod = SourceModule(""" """) func = mod.get_function("say_hi") -func(block=(4,4,1),grid=(2,2,1)) - +func(block=(4, 4, 1), grid=(2, 2, 1)) diff --git a/examples/from-wiki/thrust_interop.py b/examples/from-wiki/thrust_interop.py index 78d877a5291e415b8781fb650084ea25e57df466..1f19a0967cf0ab3b1232c17f2e3e61f838efd8f5 100644 --- a/examples/from-wiki/thrust_interop.py +++ b/examples/from-wiki/thrust_interop.py @@ -1,93 +1,94 @@ -#!python +#!python +from __future__ import annotations -import pycuda -import pycuda.autoinit -import pycuda.gpuarray as gpuarray import numpy as np - from cgen import * from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule -#Make a host_module, compiled for CPU +import
pycuda.gpuarray as gpuarray + + +# Make a host_module, compiled for CPU host_mod = BoostPythonModule() -#Make a device module, compiled with NVCC +# Make a device module, compiled with NVCC nvcc_mod = CudaModule(host_mod) -#Describe device module code -#NVCC includes +# Describe device module code +# NVCC includes nvcc_includes = [ - 'thrust/sort.h', - 'thrust/device_vector.h', - 'cuda.h', + "thrust/sort.h", + "thrust/device_vector.h", + "cuda.h", ] -#Add includes to module +# Add includes to module nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes]) -#NVCC function +# NVCC function nvcc_function = FunctionBody( - FunctionDeclaration(Value('void', 'my_sort'), - [Value('CUdeviceptr', 'input_ptr'), - Value('int', 'length')]), - Block([Statement('thrust::device_ptr<float> thrust_ptr((float*)input_ptr)'), - Statement('thrust::sort(thrust_ptr, thrust_ptr+length)')])) - -#Add declaration to nvcc_mod -#Adds declaration to host_mod as well + FunctionDeclaration(Value("void", "my_sort"), + [Value("CUdeviceptr", "input_ptr"), + Value("int", "length")]), + Block([Statement("thrust::device_ptr<float> thrust_ptr((float*)input_ptr)"), + Statement("thrust::sort(thrust_ptr, thrust_ptr+length)")])) + +# Add declaration to nvcc_mod +# Adds declaration to host_mod as well nvcc_mod.add_function(nvcc_function) host_includes = [ - 'boost/python/extract.hpp', + "boost/python/extract.hpp", ] -#Add host includes to module +# Add host includes to module host_mod.add_to_preamble([Include(x) for x in host_includes]) host_namespaces = [ - 'using namespace boost::python', + "using namespace boost::python", ] -#Add BPL using statement +# Add BPL using statement host_mod.add_to_preamble([Statement(x) for x in host_namespaces]) host_statements = [ - #Extract information from PyCUDA GPUArray - #Get length + # Extract information from PyCUDA GPUArray + # Get length 'tuple shape = extract<tuple>(gpu_array.attr("shape"))', - 'int length = extract<int>(shape[0])', - #Get data pointer + "int length = extract<int>(shape[0])", + # Get data pointer 'CUdeviceptr ptr = extract<CUdeviceptr>(gpu_array.attr("ptr"))', - #Call Thrust routine, compiled into the CudaModule - 'my_sort(ptr, length)', - #Return result - 'return gpu_array', + # Call Thrust routine, compiled into the CudaModule + "my_sort(ptr, length)", + # Return result + "return gpu_array", ] host_mod.add_function( FunctionBody( - FunctionDeclaration(Value('object', 'host_entry'), - [Value('object', 'gpu_array')]), + FunctionDeclaration(Value("object", "host_entry"), + [Value("object", "gpu_array")]), Block([Statement(x) for x in host_statements]))) -#Print out generated code, to see what we're actually compiling +# Print out generated code, to see what we're actually compiling print("---------------------- Host code ----------------------") -print((host_mod.generate())) +print(host_mod.generate()) print("--------------------- Device code ---------------------") -print((nvcc_mod.generate())) +print(nvcc_mod.generate()) print("-------------------------------------------------------") +# Compile modules +import codepy.jit +import codepy.toolchain + -#Compile modules -import codepy.jit, codepy.toolchain gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=True) - length = 100 a = np.array(np.random.rand(length), dtype=np.float32) print("---------------------- Unsorted -----------------------") @@ -98,4 +99,3 @@ c = module.host_entry(b) print("----------------------- Sorted
------------------------") print(c.get()) print("-------------------------------------------------------") - diff --git a/examples/from-wiki/using_printf.py b/examples/from-wiki/using_printf.py index c12cfa42717c16c193af9989995c15e9d1f78674..797c58a63fa7d8d4d8acd9515fa10aaa379d1b78 100644 --- a/examples/from-wiki/using_printf.py +++ b/examples/from-wiki/using_printf.py @@ -1,8 +1,9 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + from pycuda.compiler import SourceModule + mod = SourceModule(""" #include <stdio.h> @@ -13,5 +14,4 @@ mod = SourceModule(""" """) func = mod.get_function("say_hi") -func(block=(4,4,1)) - +func(block=(4, 4, 1)) diff --git a/examples/hello_gpu.py b/examples/hello_gpu.py index 23c51adba399ebf5b7b2ccecd2c295df17f496a4..d19fb09f4c33d9de7fd46da55ab7d3308140b81f 100644 --- a/examples/hello_gpu.py +++ b/examples/hello_gpu.py @@ -1,10 +1,11 @@ -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit +from __future__ import annotations + import numpy -import numpy.linalg as la + +import pycuda.driver as drv from pycuda.compiler import SourceModule + mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { @@ -21,6 +22,6 @@ b = numpy.random.randn(400).astype(numpy.float32) dest = numpy.zeros_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), - block=(400,1,1)) + block=(400, 1, 1)) print(dest-a*b) diff --git a/pycuda/__init__.py b/pycuda/__init__.py index 5ace743eb79ffafe2241db589eb6c1044125c671..3a00359e8688074aa6cd5f8614d3f398d509fd32 100644 --- a/pycuda/__init__.py +++ b/pycuda/__init__.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + VERSION = (2025, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/pycuda/_cluda.py b/pycuda/_cluda.py index fa5eda2a94fc849eec0870b312ee94eab4851e76..ff738fa35c1b027a4231ad4246813ca68211dda2 100644 --- a/pycuda/_cluda.py +++ b/pycuda/_cluda.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + CLUDA_PREAMBLE = """ #define local_barrier() __syncthreads(); diff --git a/pycuda/_mymako.py b/pycuda/_mymako.py index 826dde6198fd5c48746c312ed313091e78845633..37070b5fbdd18c9518bcb9133b16c84858915495 100644 --- a/pycuda/_mymako.py +++ b/pycuda/_mymako.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + try: import mako.template # noqa: F401 except ImportError: @@ -13,4 +16,4 @@ except ImportError: ) -from mako import * # noqa: F403, F401 +from mako import * # noqa: F403 diff --git a/pycuda/autoinit.py b/pycuda/autoinit.py index 664ed1c8e1725c46c2adcb17f5f15bcfc5ffe7eb..b628b3cd057ea95b8f8af1cb2923818e7e49c27e 100644 --- a/pycuda/autoinit.py +++ b/pycuda/autoinit.py @@ -1,10 +1,15 @@ -import pycuda.driver as cuda +from __future__ import annotations + import atexit +import pycuda.driver as cuda + + # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context + global context context = make_default_context() diff --git a/pycuda/autoprimaryctx.py b/pycuda/autoprimaryctx.py index 537c8610261ba97c07605377297b103ffe2855ea..5a860b734f90ff38823716bdd4a9afbff5504125 100644 --- a/pycuda/autoprimaryctx.py +++ b/pycuda/autoprimaryctx.py @@ -1,10 +1,14 @@ -import pycuda.driver as cuda +from __future__ import annotations + import atexit +import pycuda.driver as cuda + + # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context
def _retain_primary_context(dev): diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 3b46ff7aeac1a2e169968258b4e3e1a27351ced8..74e6c9c71611d9f313372f852c41843ae5789024 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -1,6 +1,9 @@ -from pycuda.tools import context_dependent_memoize +from __future__ import annotations + import numpy as np +from pycuda.tools import context_dependent_memoize + def platform_bits(): import sys diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 6459e1573fa77bd8e2cd81c6c819936047f5b488..d4edd6537a8f5c2a7749af59755b64b5ccdc6011 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -1,24 +1,24 @@ -from pytools import memoize - # don't import pycuda.driver here--you'll create an import loop -import os +from __future__ import annotations +import os import sys -from tempfile import mkstemp from os import unlink +from tempfile import mkstemp +from pytools import memoize from pytools.prefork import call_capture_output @memoize def get_nvcc_version(nvcc): cmdline = [nvcc, "--version"] - result, stdout, stderr = call_capture_output(cmdline) + result, stdout, _stderr = call_capture_output(cmdline) if result != 0 or not stdout: from warnings import warn - warn("NVCC version could not be determined.") + warn("NVCC version could not be determined.", stacklevel=2) stdout = b"nvcc unknown version" return stdout.decode("utf-8", "replace") @@ -39,12 +39,12 @@ def _new_md5(): def preprocess_source(source, options, nvcc): handle, source_path = mkstemp(suffix=".cu") - outf = open(source_path, "w") - outf.write(source) - outf.close() + with open(source_path, "w") as outf: + outf.write(source) + os.close(handle) - cmdline = [nvcc, "--preprocess"] + options + [source_path] + cmdline = [nvcc, "--preprocess", *options, source_path] if "win32" in sys.platform: cmdline.extend(["--compiler-options", "-EP"]) else: @@ -102,11 +102,8 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): cache_path = join(cache_dir, cache_file + "." + target) try: - cache_file = open(cache_path, "rb") - try: + with open(cache_path, "rb") as cache_file: return cache_file.read() - finally: - cache_file.close() except Exception: pass @@ -119,9 +116,8 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): cu_file_name = file_root + ".cu" cu_file_path = join(file_dir, cu_file_name) - outf = open(cu_file_path, "w") - outf.write(str(source)) - outf.close() + with open(cu_file_path, "w") as outf: + outf.write(str(source)) if keep: options = options[:] @@ -129,13 +125,13 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): print("*** compiler output in %s" % file_dir) - cmdline = [nvcc, "--" + target] + options + [cu_file_name] + cmdline = [nvcc, "--" + target, *options, cu_file_name] result, stdout, stderr = call_capture_output( cmdline, cwd=file_dir, error_on_nonzero=False ) try: - result_f = open(join(file_dir, file_root + "." + target), "rb") + result_f = open(join(file_dir, file_root + "." 
+ target), "rb") # noqa: SIM115 except OSError: no_output = True else: @@ -147,7 +143,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): warn( "PyCUDA: nvcc exited with status 0, but appears to have " - "encountered an error" + "encountered an error", stacklevel=2 ) from pycuda.driver import CompileError @@ -178,12 +174,11 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): result_f.close() if cache_dir: - outf = open(cache_path, "wb") - outf.write(result_data) - outf.close() + with open(cache_path, "wb") as outf: + outf.write(result_data) if not keep: - from os import listdir, unlink, rmdir + from os import listdir, rmdir, unlink for name in listdir(file_dir): unlink(join(file_dir, name)) @@ -230,10 +225,12 @@ def compile( arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, target="cubin", ): + if include_dirs is None: + include_dirs = [] assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: @@ -282,14 +279,15 @@ def compile( if code is not None: options.extend(["-code", code]) - if "darwin" in sys.platform and sys.maxsize == 9223372036854775807: - options.append("-m64") - elif "win32" in sys.platform and sys.maxsize == 9223372036854775807: + if ( + ("darwin" in sys.platform and sys.maxsize == 9223372036854775807) + or + ("win32" in sys.platform and sys.maxsize == 9223372036854775807)): options.append("-m64") elif "win32" in sys.platform and sys.maxsize == 2147483647: options.append("-m32") - include_dirs = include_dirs + [_find_pycuda_include_path()] + include_dirs = [*include_dirs, _find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) @@ -310,7 +308,7 @@ class CudaModule: warn( "trying to compile for a compute capability " - "higher than selected GPU" + "higher than selected GPU", stacklevel=2 ) except Exception: pass @@ -341,8 +339,10 @@ class SourceModule(CudaModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, ): + if include_dirs is None: + include_dirs = [] self._check_arch(arch) cubin = compile( @@ -368,8 +368,8 @@ def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 - from os.path import exists, abspath, join - from os import pathsep, environ + from os import environ, pathsep + from os.path import abspath, exists, join search_path = environ["PATH"] @@ -400,13 +400,15 @@ class DynamicModule(CudaModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, message_handler=None, log_verbose=False, cuda_libdir=None, ): from pycuda.driver import Context + if include_dirs is None: + include_dirs = [] compute_capability = Context.get_device().compute_capability() if compute_capability < (3, 5): raise Exception( @@ -563,9 +565,11 @@ class DynamicSourceModule(DynamicModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, cuda_libdir=None, ): + if include_dirs is None: + include_dirs = [] super().__init__( nvcc=nvcc, link_options=None, diff --git a/pycuda/cumath.py b/pycuda/cumath.py index ab2b32c86079b05af501e0eebeabacd7e181d2e7..4ac71d587bdd90b30a2659869b2a709b24f6fd9a 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -1,7 +1,11 @@ -import pycuda.gpuarray as gpuarray -import pycuda.elementwise as elementwise -import numpy as np +from __future__ import annotations + import warnings + +import numpy as np + +import pycuda.elementwise as elementwise +import pycuda.gpuarray as gpuarray from pycuda.driver import 
Stream @@ -10,7 +14,7 @@ def _make_unary_array_func(name): if stream_or_out is not None: warnings.warn( - "please use 'out' or 'stream' keyword arguments", DeprecationWarning + "please use 'out' or 'stream' keyword arguments", DeprecationWarning, stacklevel=2 ) if isinstance(stream_or_out, Stream): stream = stream_or_out @@ -25,10 +29,7 @@ def _make_unary_array_func(name): if "stream" in kwargs: stream = kwargs["stream"] - if array.dtype == np.float32: - func_name = name + "f" - else: - func_name = name + func_name = name + "f" if array.dtype == np.float32 else name if not array.flags.forc: raise RuntimeError( diff --git a/pycuda/curandom.py b/pycuda/curandom.py index 6dbbeede174017b0d79b7cf03ed78c088b2fd7cc..4382b888176e2a6597a7130202d36a6a83d7ab95 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -1,8 +1,12 @@ +from __future__ import annotations + import numpy as np + +from pytools import memoize_method + import pycuda.compiler import pycuda.driver as drv import pycuda.gpuarray as array -from pytools import memoize_method # {{{ MD5-based random number generation @@ -182,8 +186,8 @@ md5_code = """ def rand(shape, dtype=np.float32, stream=None): - from pycuda.gpuarray import GPUArray from pycuda.elementwise import get_elwise_kernel + from pycuda.gpuarray import GPUArray result = GPUArray(shape, dtype) @@ -515,12 +519,12 @@ class _RandomNumberGeneratorBase: self.module = module = pycuda.compiler.SourceModule(source, no_extern_c=True) self.generators = {} - for name, out_type, suffix in my_generators: + for name, _out_type, _suffix in my_generators: gen_func = module.get_function(name) gen_func.prepare("PPn") self.generators[name] = gen_func if get_curand_version() >= (4, 0, 0): - for name, in_type, out_type, suffix in my_log_generators: + for name, in_type, _out_type, _suffix in my_log_generators: gen_func = module.get_function(name) if in_type == "float": gen_func.prepare("PPffn") @@ -528,11 +532,11 @@ class _RandomNumberGeneratorBase: gen_func.prepare("PPddn") self.generators[name] = gen_func if get_curand_version() >= (5, 0, 0): - for name, out_type, suffix in my_poisson_generators: + for name, _out_type, _suffix in my_poisson_generators: gen_func = module.get_function(name) gen_func.prepare("PPdn") self.generators[name] = gen_func - for name, inout_type, suffix in my_poisson_inplace_generators: + for name, _inout_type, _suffix in my_poisson_inplace_generators: gen_func = module.get_function(name) gen_func.prepare("PPn") self.generators[name] = gen_func @@ -553,10 +557,7 @@ class _RandomNumberGeneratorBase: self.skip_ahead_array.prepare("PnP") def _kernels(self): - return list(self.generators.values()) + [ - self.skip_ahead, - self.skip_ahead_array, - ] + return [*list(self.generators.values()), self.skip_ahead, self.skip_ahead_array] @property @memoize_method @@ -831,12 +832,7 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): def _kernels(self): return ( - _RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")] - + [ - self.module.get_function("skip_ahead_sequence"), - self.module.get_function("skip_ahead_sequence_array"), - ] + [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare"), self.module.get_function("skip_ahead_sequence"), self.module.get_function("skip_ahead_sequence_array")] ) @@ -1008,10 +1004,7 @@ if get_curand_version() >= (4, 1, 0): ) def _kernels(self): - return _PseudoRandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("skip_ahead_subsequence"), - 
self.module.get_function("skip_ahead_subsequence_array"), - ] + return [*_PseudoRandomNumberGeneratorBase._kernels(self), self.module.get_function("skip_ahead_subsequence"), self.module.get_function("skip_ahead_subsequence_array")] # }}} @@ -1128,9 +1121,7 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return _RandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("prepare") - ] + return [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare")] scrambledsobol_random_source = """ @@ -1230,9 +1221,7 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return _RandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("prepare") - ] + return [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare")] if get_curand_version() >= (3, 2, 0): diff --git a/pycuda/debug.py b/pycuda/debug.py index dee5974c93efc3c5d239df2782abfe3f0aed9071..f2973b0832ac5c51b5c62e690ac263bf239665fe 100644 --- a/pycuda/debug.py +++ b/pycuda/debug.py @@ -1,9 +1,12 @@ -import pycuda.driver +from __future__ import annotations import sys from optparse import OptionParser from os.path import exists +import pycuda.driver + + pycuda.driver.set_debugging() parser = OptionParser(usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") @@ -23,4 +26,5 @@ if not exists(mainpyfile): sys.argv = args -exec(compile(open(mainpyfile).read(), mainpyfile, "exec")) +with open(mainpyfile) as mainpy: + exec(compile(mainpy.read(), mainpyfile, "exec")) diff --git a/pycuda/driver.py b/pycuda/driver.py index 4bce8347a486d853ab989b3181b8e77053ae0fd7..b5070c9ca3016186201f88beec7dcb1ae1af2cc4 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,9 +1,13 @@ +from __future__ import annotations + + __copyright__ = """ Copyright 2008-2021 Andreas Kloeckner Copyright 2021 NVIDIA Corporation """ import os + import numpy as np @@ -14,8 +18,8 @@ def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 - from os.path import exists, abspath, join - from os import pathsep, environ + from os import environ, pathsep + from os.path import abspath, exists, join search_path = environ["PATH"] @@ -27,7 +31,7 @@ def _search_on_path(filenames): def _add_cuda_libdir_to_dll_path(): - from os.path import join, dirname + from os.path import dirname, join cuda_path = os.environ.get("CUDA_PATH") @@ -46,12 +50,12 @@ def _add_cuda_libdir_to_dll_path(): "Unable to discover CUDA installation directory " "while attempting to add it to Python's DLL path. " "Either set the 'CUDA_PATH' environment variable " - "or ensure that 'nvcc.exe' is on the path." + "or ensure that 'nvcc.exe' is on the path.", stacklevel=2 ) try: - os.add_dll_directory + os.add_dll_directory # noqa: B018 except AttributeError: # likely not on Py3.8 and Windows # https://github.com/inducer/pycuda/issues/213 @@ -71,7 +75,7 @@ except ImportError as e: warn( "Failed to import the CUDA driver interface, with an error " "message indicating that the version of your CUDA header " - "does not match the version of your CUDA driver." 
+ "does not match the version of your CUDA driver.", stacklevel=2 ) raise @@ -131,7 +135,8 @@ class ArgumentHandler: self.dev_alloc = mem_alloc_like(self.array) except AttributeError: raise TypeError( - "could not determine array length of '%s': unsupported array type or not an array" + "could not determine array length of '%s': " + "unsupported array type or not an array" % type(self.array) ) return self.dev_alloc @@ -177,7 +182,7 @@ def _add_functionality(): warn( "CUDA driver raised '%s' when querying '%s' on '%s'" - % (e, att_name, dev) + % (e, att_name, dev), stacklevel=2 ) else: result[att_id] = att_value @@ -305,9 +310,11 @@ def _add_functionality(): for handler in post_handlers: handler.post_call(stream) - def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=[]): + def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=None): from warnings import warn + if texrefs is None: + texrefs = [] if block is not None: warn( "setting the block size in Function.prepare is deprecated", @@ -328,7 +335,7 @@ def _add_functionality(): func.arg_format = "" - for i, arg_type in enumerate(arg_types): + for _i, arg_type in enumerate(arg_types): if ( isinstance(arg_type, type) and np is not None @@ -358,7 +365,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", None) if shared_size is not None: @@ -390,7 +397,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", None) if shared_size is not None: @@ -433,7 +440,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (stream,) + args + args = (stream, *args) stream = block shared_size = kwargs.pop("shared_size", None) @@ -522,12 +529,14 @@ def _add_functionality(): for handler in post_handlers: handler.post_call(stream) - def function_prepare(func, arg_types, texrefs=[]): + def function_prepare(func, arg_types, texrefs=None): + if texrefs is None: + texrefs = [] func.texrefs = texrefs func.arg_format = "" - for i, arg_type in enumerate(arg_types): + for _i, arg_type in enumerate(arg_types): if isinstance(arg_type, type) and np.number in arg_type.__mro__: func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, np.dtype): @@ -554,7 +563,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", 0) @@ -611,7 +620,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (stream,) + args + args = (stream, *args) stream = block shared_size = kwargs.pop("shared_size", 0) @@ -657,16 +666,8 @@ def _add_functionality(): ) return func(*args, **kwargs) - try: - from functools import update_wrapper - except ImportError: - pass - else: - try: - update_wrapper(new_func, func) - except Exception: - # User won't see true signature. Oh well. 
- pass + from functools import update_wrapper + update_wrapper(new_func, func) return new_func @@ -1070,12 +1071,8 @@ def matrix_to_texref(matrix, texref, order): def to_device(bf_obj): - import sys - if sys.version_info >= (2, 7): - bf = memoryview(bf_obj).tobytes() - else: - bf = buffer(bf_obj) + bf = memoryview(bf_obj).tobytes() result = mem_alloc(len(bf)) memcpy_htod(result, bf) return result diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 1ca6706f4d922444d00a33b56cb9d352dd1192e8..cc710409c832e92b2116e6c167b0417e3836a4b5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -1,4 +1,6 @@ """Elementwise functionality.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -26,12 +28,14 @@ OTHER DEALINGS IN THE SOFTWARE. """ -from pycuda.tools import context_dependent_memoize from typing import Any + import numpy as np -from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg + from pytools import memoize_method +from pycuda.tools import ScalarArg, VectorArg, context_dependent_memoize, dtype_to_ctype + def get_elwise_module( arguments, operation, @@ -170,10 +174,7 @@ def get_elwise_kernel_and_types( else: arguments.append(ScalarArg(np.uintp, "n")) - if use_range: - module_builder = get_elwise_range_module - else: - module_builder = get_elwise_module + module_builder = get_elwise_range_module if use_range else get_elwise_module mod = module_builder(arguments, operation, name, keep, options, **kwargs) @@ -189,7 +190,7 @@ def get_elwise_kernel( """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. """ - mod, func, arguments = get_elwise_kernel_and_types( + _mod, func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs ) @@ -211,7 +212,7 @@ class ElementwiseKernel: }) def get_texref(self, name, use_range=False): - mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) + mod, _knl, _arguments = self.generate_stride_kernel_and_types(use_range=use_range) return mod.get_texref(name) @memoize_method @@ -241,7 +242,7 @@ class ElementwiseKernel: ) invocation_args = [] - mod, func, arguments = self.generate_stride_kernel_and_types( + _mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None ) @@ -414,8 +415,8 @@ def get_copy_kernel(dtype_dest, dtype_src): @context_dependent_memoize def get_linear_combination_kernel(summand_descriptors, dtype_z): + from pycuda.elementwise import ScalarArg, VectorArg, get_elwise_module from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include <pycuda-helpers.hpp>\n\n"] @@ -461,7 +462,7 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z): return func, tex_src -def _get_real_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]": +def _get_real_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]: assert dtype.kind == "c" return np.empty(0, dtype).real.dtype diff --git a/pycuda/gl/__init__.py b/pycuda/gl/__init__.py index 3120d2de74f72fc4b8eddc54ea2b82475d3e259e..bbe607ec24aa6d2eac9300ea271df20894c32b63 100644 --- a/pycuda/gl/__init__.py +++ b/pycuda/gl/__init__.py @@ -1,6 +1,8 @@ -from __future__ import absolute_import +from __future__ import annotations + import pycuda._driver as _drv + if not _drv.have_gl_ext(): raise ImportError("PyCUDA was compiled without GL extension support") diff --git a/pycuda/gl/autoinit.py b/pycuda/gl/autoinit.py index
fc8c9a668f365ff5aa2be3c89cc97413c634f87a..e4104774a812611152e3af560ab84f70b119a2dd 100644 --- a/pycuda/gl/autoinit.py +++ b/pycuda/gl/autoinit.py @@ -1,12 +1,17 @@ -from __future__ import absolute_import +from __future__ import annotations + +import atexit + import pycuda.driver as cuda import pycuda.gl as cudagl -import atexit + cuda.init() assert cuda.Device.count() >= 1 -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context + + context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index b2d3ac373d128eba036561606404095c29a1f254..cd5b7b233d54ab373f443395d39741d0c1718a47 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,24 +1,29 @@ +from __future__ import annotations + + __copyright__ = """ Copyright 2008-2021 Andreas Kloeckner Copyright 2021 NVIDIA Corporation """ +import copyreg +import numbers +from functools import reduce + import numpy as np -import pycuda.elementwise as elementwise + from pytools import memoize, memoize_method + import pycuda.driver as drv +import pycuda.elementwise as elementwise +from pycuda.characterize import has_double_support from pycuda.compyte.array import ( + ArrayFlags as _ArrayFlags, as_strided as _as_strided, - f_contiguous_strides as _f_contiguous_strides, c_contiguous_strides as _c_contiguous_strides, - ArrayFlags as _ArrayFlags, + f_contiguous_strides as _f_contiguous_strides, get_common_dtype as _get_common_dtype_base, ) -from pycuda.characterize import has_double_support -from functools import reduce -import numbers - -import copyreg def _get_common_dtype(obj1, obj2): @@ -286,10 +291,7 @@ class GPUArray: def __cuda_array_interface__(self): """Returns a CUDA Array Interface dictionary describing this array's data.""" - if self.gpudata is not None: - ptr = int(self.gpudata) - else: - ptr = 0 + ptr = int(self.gpudata) if self.gpudata is not None else 0 return { "shape": self.shape, @@ -828,10 +830,7 @@ class GPUArray: with new, the user can choose between ipow or just pow """ common_dtype = _get_common_dtype(self, other) - if new: - result = self._new_like_me(common_dtype) - else: - result = self + result = self._new_like_me(common_dtype) if new else self # {{{ sanity checks @@ -960,7 +959,7 @@ class GPUArray: "only contiguous arrays may " "be used as arguments to this operation" ) - if isinstance(shape[0], tuple) or isinstance(shape[0], list): + if isinstance(shape[0], (tuple, list)): shape = tuple(shape[0]) same_contiguity = (order == "C" and self.flags.c_contiguous) or ( @@ -1012,14 +1011,14 @@ class GPUArray: raise ValueError("new type not compatible with array") new_shape = ( - self.shape[:min_stride_axis] - + (self.shape[min_stride_axis] * old_itemsize // itemsize,) - + self.shape[min_stride_axis + 1:] + (*self.shape[:min_stride_axis], + self.shape[min_stride_axis] * old_itemsize // itemsize, + *self.shape[min_stride_axis + 1:]) ) new_strides = ( - self.strides[:min_stride_axis] - + (self.strides[min_stride_axis] * itemsize // old_itemsize,) - + self.strides[min_stride_axis + 1:] + (*self.strides[:min_stride_axis], + self.strides[min_stride_axis] * itemsize // old_itemsize, + *self.strides[min_stride_axis + 1:]) ) return GPUArray( @@ -1036,9 +1035,9 @@ class GPUArray: Returns a view of the array with dimensions of length 1 removed. 
""" - new_shape = tuple([dim for dim in self.shape if dim > 1]) + new_shape = tuple(dim for dim in self.shape if dim > 1) new_strides = tuple( - [self.strides[i] for i, dim in enumerate(self.shape) if dim > 1] + self.strides[i] for i, dim in enumerate(self.shape) if dim > 1 ) return GPUArray( @@ -1188,10 +1187,7 @@ class GPUArray: from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" + order = "F" if self.flags.f_contiguous else "C" result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_real_kernel(dtype, real_dtype) @@ -1221,10 +1217,7 @@ class GPUArray: from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" + order = "F" if self.flags.f_contiguous else "C" result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_imag_kernel(dtype, real_dtype) @@ -1250,14 +1243,8 @@ class GPUArray: "be used as arguments to this operation" ) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - if out is None: - result = self._new_like_me(order=order) - else: - result = out + order = "F" if self.flags.f_contiguous else "C" + result = self._new_like_me(order=order) if out is None else out func = elementwise.get_conj_kernel(dtype, result.dtype) func.prepared_async_call( @@ -1459,12 +1446,12 @@ def arange(*args, **kwargs): warn( "behavior change: arange guessed dtype other than float32. " - "suggest specifying explicit dtype." + "suggest specifying explicit dtype.", stacklevel=2 ) from math import ceil - size = int(ceil((stop - start) / step)) + size = ceil((stop - start) / step) result = GPUArray((size,), dtype) @@ -1897,7 +1884,9 @@ def concatenate(arrays, axis=0, allocator=None): base_idx = 0 for ary in arrays: my_len = ary.shape[axis] - result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + result[(*full_slice[:axis], + slice(base_idx, base_idx + my_len), + *full_slice[axis + 1:])] = ary base_idx += my_len return result @@ -1928,7 +1917,7 @@ def stack(arrays, axis=0, allocator=None): if not (0 <= axis <= input_ndim): raise ValueError("invalid axis") - result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result_shape = (*input_shape[:axis], len(arrays), *input_shape[axis:]) result = empty(shape=result_shape, dtype=np.result_type(*(ary.dtype for ary in arrays)), allocator=allocator, order="C" if axis == 0 else "F") diff --git a/pycuda/reduction.py b/pycuda/reduction.py index 0ae3f094e86f59c495e004caf56b0740f9449344..ef5d0fc14b549574e147058e6957e898e41a39e3 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -1,4 +1,5 @@ """Computation of reductions on vectors.""" +from __future__ import annotations __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -57,10 +58,10 @@ Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through source code with only those rights set forth herein. 
""" -from pycuda.tools import context_dependent_memoize -from pycuda.tools import dtype_to_ctype import numpy as np +from pycuda.tools import context_dependent_memoize, dtype_to_ctype + def get_reduction_module( out_type, @@ -180,13 +181,10 @@ def get_reduction_kernel_and_types( map_expr = "pycuda_reduction_inp[i]" in_arg = "const %s *pycuda_reduction_inp" % out_type - if arguments: - arguments = in_arg + ", " + arguments - else: - arguments = in_arg + arguments = in_arg + ", " + arguments if arguments else in_arg else: - assert False + raise AssertionError() mod = get_reduction_module( out_type, @@ -264,8 +262,8 @@ class ReductionKernel: ) def __call__(self, *args, **kwargs): - MAX_BLOCK_COUNT = 1024 - SMALL_SEQ_COUNT = 4 + MAX_BLOCK_COUNT = 1024 # noqa: N806 + SMALL_SEQ_COUNT = 4 # noqa: N806 s1_func = self.stage1_func s2_func = self.stage2_func @@ -304,7 +302,7 @@ class ReductionKernel: repr_vec = vectors[0] sz = repr_vec.size - allocator = kwargs.get("allocator", None) + allocator = kwargs.get("allocator") if allocator is None: allocator = repr_vec.allocator @@ -335,7 +333,7 @@ class ReductionKernel: (block_count, 1), (self.block_size, 1, 1), stream, - *([result.gpudata] + invocation_args + [seq_count, sz]), + *([result.gpudata, *invocation_args, seq_count, sz]), **kwargs ) @@ -344,7 +342,7 @@ class ReductionKernel: else: f = s2_func arg_types = self.stage2_arg_types - args = (result,) + stage1_args + args = (result, *stage1_args) @context_dependent_memoize @@ -426,10 +424,7 @@ def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): dtype_out = dtype_a if dtype_b is None: - if dtype_a is None: - dtype_b = dtype_out - else: - dtype_b = dtype_a + dtype_b = dtype_out if dtype_a is None else dtype_a if dtype_a is None: dtype_a = dtype_out diff --git a/pycuda/scan.py b/pycuda/scan.py index 1a1b5b937763de81cec2f1af1c1d3b4751ecc3e4..65a9b0d6c9498182bf290530c93903d838af2579 100644 --- a/pycuda/scan.py +++ b/pycuda/scan.py @@ -1,4 +1,6 @@ """Scan primitive.""" +from __future__ import annotations + __copyright__ = """ Copyright 2011 Andreas Kloeckner @@ -25,12 +27,12 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/ import numpy as np +import pycuda._mymako as mako import pycuda.driver as driver import pycuda.gpuarray as gpuarray +from pycuda._cluda import CLUDA_PREAMBLE from pycuda.compiler import SourceModule from pycuda.tools import dtype_to_ctype -import pycuda._mymako as mako -from pycuda._cluda import CLUDA_PREAMBLE SHARED_PREAMBLE = ( diff --git a/pycuda/sparse/__init__.py b/pycuda/sparse/__init__.py index 0949981b69451fbb55ca1c8ed78af7f9a37457f6..16b468914cc90bb832c005cc3920d72c5bfaf3ba 100644 --- a/pycuda/sparse/__init__.py +++ b/pycuda/sparse/__init__.py @@ -1,6 +1,8 @@ -from __future__ import absolute_import +from __future__ import annotations + from warnings import warn + warn( "pycuda.sparse is deprecated. 
and will be removed in 2015.x", DeprecationWarning, diff --git a/pycuda/sparse/cg.py b/pycuda/sparse/cg.py index 42148ecec971e0743b51c8309c85099cf9b86138..4079c1fa12218691c3a652581467a7eae4c2611a 100644 --- a/pycuda/sparse/cg.py +++ b/pycuda/sparse/cg.py @@ -1,11 +1,12 @@ -from __future__ import division -from __future__ import absolute_import -from pycuda.sparse.inner import AsyncInnerProduct +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.driver as drv import pycuda.gpuarray as gpuarray - -import numpy as np +from pycuda.sparse.inner import AsyncInnerProduct class ConvergenceError(RuntimeError): @@ -158,10 +159,7 @@ class CGStateContainer: self.one_iteration(compute_real_residual=compute_real_residual) if debug_callback is not None: - if compute_real_residual: - what = "it+residual" - else: - what = "it" + what = "it+residual" if compute_real_residual else "it" debug_callback( what, iterations, self.x, self.residual, self.d, self.delta diff --git a/pycuda/sparse/coordinate.py b/pycuda/sparse/coordinate.py index 8299387f97f0c5f9e0a43ba8315b1e6705678387..13191a86b4f90ebc44f7c72f8160712d86663569 100644 --- a/pycuda/sparse/coordinate.py +++ b/pycuda/sparse/coordinate.py @@ -1,10 +1,12 @@ -from __future__ import division -from __future__ import absolute_import +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy as np COO_FLAT_KERNEL_TEMPLATE = """ diff --git a/pycuda/sparse/inner.py b/pycuda/sparse/inner.py index 7e55e3078130ecfca3c1592d2a621797eb18470b..7e61498f56ae65698765604f67feb591a9d8efed 100644 --- a/pycuda/sparse/inner.py +++ b/pycuda/sparse/inner.py @@ -1,10 +1,10 @@ -from __future__ import division -from __future__ import absolute_import -import pycuda.driver as drv -import pycuda.gpuarray as gpuarray +from __future__ import annotations import atexit +import pycuda.driver as drv +import pycuda.gpuarray as gpuarray + STREAM_POOL = [] diff --git a/pycuda/sparse/operator.py b/pycuda/sparse/operator.py index bf8231c29fabffbdc1a70035b27735d9ab7d0240..9785564179995d082ed378c572f33550e1dfd40b 100644 --- a/pycuda/sparse/operator.py +++ b/pycuda/sparse/operator.py @@ -1,4 +1,7 @@ -class OperatorBase(object): +from __future__ import annotations + + +class OperatorBase: @property def dtype(self): raise NotImplementedError diff --git a/pycuda/sparse/packeted.py b/pycuda/sparse/packeted.py index f2819db81bfc0be180c71beb68e802b4040b7c67..c06ae3226a805dce33839623477283bad5cb14dc 100644 --- a/pycuda/sparse/packeted.py +++ b/pycuda/sparse/packeted.py @@ -1,10 +1,11 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy as np PKT_KERNEL_TEMPLATE = """ @@ -120,7 +121,7 @@ class PacketedSpMV: adj_mat = csr_mat while True: - cut_count, dof_to_packet_nr = part_graph( + _cut_count, dof_to_packet_nr = part_graph( int(self.block_count), xadj=adj_mat.indptr, adjncy=adj_mat.indices ) @@ -148,10 +149,10 @@ class PacketedSpMV: old_block_count = self.block_count self.block_count = int(2 + 1.05 * self.block_count) print( - ( + "Metis produced a big block at block count " "%d--retrying with %d" % (old_block_count, self.block_count) - ) + ) continue 
@@ -230,7 +231,7 @@ class PacketedSpMV:
     def find_local_row_costs_and_remaining_coo(
         self, csr_mat, dof_to_packet_nr, old2new_fetch_indices
     ):
-        h, w = self.shape
+        h, _w = self.shape
         local_row_costs = [0] * h
         rem_coo_values = []
         rem_coo_i = []
diff --git a/pycuda/sparse/pkt_build.py b/pycuda/sparse/pkt_build.py
index b3591584bd6b54f492a53e3da9fdc1ebca8b0e2e..235e1a0f782fc94477e7addb06bd5a1cf5f6693d 100644
--- a/pycuda/sparse/pkt_build.py
+++ b/pycuda/sparse/pkt_build.py
@@ -1,5 +1,7 @@
-from __future__ import absolute_import
+from __future__ import annotations
+
 import numpy as np
+
 import pycuda.gpuarray as gpuarray
@@ -74,4 +76,6 @@ except ImportError:
     pass
 else:
     pyximport.install()
-    from pycuda.sparse.pkt_build_cython import build_pkt_data_structure  # noqa: F811, F401
+    from pycuda.sparse.pkt_build_cython import (
+        build_pkt_data_structure,  # noqa: F401
+    )
diff --git a/pycuda/tools.py b/pycuda/tools.py
index 3e0ccbd2cea5378294e12c9a8cee1fabcf6269c6..e37e616f3bea6ad4fe4b60b1b7d1920e4a4545a7 100644
--- a/pycuda/tools.py
+++ b/pycuda/tools.py
@@ -1,4 +1,6 @@
 """Miscallenous helper functionality."""
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
@@ -25,18 +27,20 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 """
 
-import pycuda.driver as cuda
-import pycuda._driver as _drv
-import numpy as np
+import contextlib
 
+import numpy as np
+
+import pycuda._driver as _drv
+import pycuda.driver as cuda
 from pycuda.compyte.dtypes import (  # noqa: F401
-    register_dtype,
-    get_or_register_dtype,
     _fill_dtype_registry,
     dtype_to_ctype as base_dtype_to_ctype,
+    get_or_register_dtype,
+    register_dtype,
 )
+
 
 bitlog2 = _drv.bitlog2
 DeviceMemoryPool = _drv.DeviceMemoryPool
 PageLockedMemoryPool = _drv.PageLockedMemoryPool
@@ -91,9 +95,8 @@ class DebugMemoryPool(DeviceMemoryPool):
             "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):"
             "\n at: %s"
             % (
-                (size, self.last_free)
-                + cuda.mem_get_info()
-                + (self.held_blocks, self.active_blocks, description)
+                (size, self.last_free, *cuda.mem_get_info(),
+                 self.held_blocks, self.active_blocks, description)
             ),
             file=self.logfile,
         )
@@ -144,23 +147,19 @@ def get_default_device(default=0):
     warn(
         "get_default_device() is deprecated; "
         "use make_default_context() instead",
-        DeprecationWarning,
+        DeprecationWarning, stacklevel=2,
     )
-    from pycuda.driver import Device
     import os
 
+    from pycuda.driver import Device
+
     dev = os.environ.get("CUDA_DEVICE")
 
     if dev is None:
-        try:
-            dev = (
-                open(os.path.join(os.path.expanduser("~"), ".cuda_device"))
-                .read()
-                .strip()
-            )
-        except Exception:
-            pass
+        with contextlib.suppress(Exception):  # noqa: SIM117
+            with open(os.path.join(os.path.expanduser("~"), ".cuda_device")) as devrc:
+                dev = devrc.read().strip()
 
     if dev is None:
         dev = default
@@ -197,7 +196,8 @@ def make_default_context(ctx_maker=None):
         try:
             homedir = os.environ.get("HOME")
             assert homedir is not None
-            devn = open(os.path.join(homedir, ".cuda_device")).read().strip()
+            with open(os.path.join(homedir, ".cuda_device")) as devrc:
+                devn = devrc.read().strip()
         except Exception:
             pass
@@ -247,7 +247,7 @@ def _int_ceiling(value, multiple_of=1):
 
     from math import ceil
 
-    return int(ceil(value / multiple_of)) * multiple_of
+    return ceil(value / multiple_of) * multiple_of
@@ -256,7 +256,7 @@ def _int_floor(value, multiple_of=1):
 
     from math import floor
 
-    return int(floor(value / multiple_of)) * multiple_of
+    return floor(value / multiple_of) * multiple_of
 
 
 # }}}
@@ -314,9 +314,7 @@ class DeviceData:
     def align_bytes(self, word_size=4):
         if word_size == 4:
             return 64
-        elif word_size == 8:
-            return 128
-        elif word_size == 16:
+        elif word_size == 8 or word_size == 16:
            return 128
        else:
            raise ValueError("no alignment possible for fetches of size %d" % word_size)
@@ -452,10 +450,7 @@ context_dependent_memoized_functions = []
 
 def context_dependent_memoize(func):
     def wrapper(*args, **kwargs):
-        if kwargs:
-            cache_key = (args, frozenset(kwargs.items()))
-        else:
-            cache_key = (args,)
+        cache_key = (args, frozenset(kwargs.items())) if kwargs else (args,)
 
         try:
             ctx_dict = func._pycuda_ctx_dep_memoize_dic
diff --git a/pyproject.toml b/pyproject.toml
index 8d784a39029a9ae13aef61b18f064dec0e6d3aeb..fb3d887696ec40085d1bfc3fce410d8f395cf0cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,91 @@ requires = [
     "wheel",
     "numpy>=1.24",
 ]
+
+[tool.ruff]
+preview = true
+# FIXME
+line-length = 100
+
+[tool.ruff.lint]
+extend-select = [
+    "B",   # flake8-bugbear
+    "C",   # flake8-comprehensions
+    "E",   # pycodestyle
+    "F",   # pyflakes
+    "G",   # flake8-logging-format
+    "I",   # flake8-isort
+    "N",   # pep8-naming
+    "NPY", # numpy
+    "PGH", # pygrep-hooks
+    "Q",   # flake8-quotes
+    "RUF", # ruff
+    "SIM", # flake8-simplify
+    "TC",  # flake8-type-checking
+    "UP",  # pyupgrade
+    "W",   # pycodestyle
+]
+extend-ignore = [
+    "C90",    # McCabe complexity
+    "E221",   # multiple spaces before operator
+    "E226",   # missing whitespace around arithmetic operator
+    "E402",   # module-level import not at top of file
+    "UP031",  # use f-strings instead of %
+    "UP032",  # use f-strings instead of .format
+    "NPY002", # legacy random
+    "SIM102", # single if instead of nested
+    "PGH004", # noqa requires rule
+    "B904",   # raise within except: use from
+    "RUF012", # ClassVar
+    "SIM108", # ternary instead of if-then-else
+]
+
+[tool.ruff.lint.flake8-quotes]
+docstring-quotes = "double"
+inline-quotes = "double"
+multiline-quotes = "double"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-local-folder = [
+    "pycuda",
+]
+known-first-party = [
+    "pytools",
+]
+lines-after-imports = 2
+required-imports = ["from __future__ import annotations"]
+
+[tool.ruff.lint.pep8-naming]
+extend-ignore-names = ["update_for_*"]
+
+[tool.ruff.lint.per-file-ignores]
+"test/test_*.py" = [
+    "N806", # upper case locals
+]
+"setup.py" = [
+    "N806",   # upper case locals
+    "SIM115", # context manager for files
+]
+"pycuda/sparse/coordinate.py" = [
+    "E501", # line length
+]
+"pycuda/driver.py" = [
+    "F405", # undefined symbol
+    "N806", # upper case locals
+]
+"pycuda/curandom.py" = [
+    "E501", # line length
+]
+"examples/from-wiki/*.py" = [
+    "F", "E", "N", "B"
+]
+"examples/demo_cdpSimplePrint.py" = [
+    "E501", # line length
+    "N816", # mixed case locals
+]
+"aksetup_helper.py" = [
+    # effectively unmaintained, will go away
+    "UP", "C", "E501", "B", "SIM", "RUF",
+]
+"test/undistributed/*.py" = ["B"]
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 291835073f52a47141bc239a0be603bc742d7d39..0000000000000000000000000000000000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-[flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,W503,N806,F405,E501
-# FIXME: fix in future: N806, F405, E501
-max-line-length=85
-exclude=pycuda/compyte/ndarray,pycuda/compyte/array.py
-
-inline-quotes = "
-docstring-quotes = """
-multiline-quotes = """
diff --git a/setup.py b/setup.py
index 5dd88d2920f6fcdc871b4e771ec1148835e5b281..a15ef2b8b3b1af1c13ad67ffbccaf844983d4ef7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+from __future__ import annotations
 
-from __future__ import absolute_import, print_function
 
 from os.path import dirname, join, normpath
 
@@ -9,8 +8,8 @@ def search_on_path(filenames):
     """Find file on system path."""
 
     # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224
-    from os.path import exists, abspath
-    from os import pathsep, environ
+    from os import environ, pathsep
+    from os.path import abspath, exists
 
     search_path = environ["PATH"]
@@ -23,14 +22,14 @@ def search_on_path(filenames):
 
 def get_config_schema():
     from aksetup_helper import (
+        BoostLibraries,
         ConfigSchema,
-        Option,
         IncludeDir,
-        LibraryDir,
         Libraries,
-        BoostLibraries,
-        Switch,
+        LibraryDir,
+        Option,
         StringListOption,
+        Switch,
         make_boost_base_options,
     )
 
@@ -79,19 +78,15 @@ def get_config_schema():
         default_lib_dirs.append("/usr/local/cuda/lib")
 
     return ConfigSchema(
-        make_boost_base_options()
-        + [
+        [*make_boost_base_options(),
             Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"),
             BoostLibraries("python"),
             BoostLibraries("thread"),
             Switch("CUDA_TRACE", False, "Enable CUDA API tracing"),
-            Option(
-                "CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit"
-            ),
-            Option(
-                "CUDA_PRETEND_VERSION",
-                help="Assumed CUDA version, in the form 3010 for 3.1.",
-            ),
+            Option("CUDA_ROOT", default=cuda_root_default,
+                help="Path to the CUDA toolkit"),
+            Option("CUDA_PRETEND_VERSION",
+                help="Assumed CUDA version, in the form 3010 for 3.1."),
             IncludeDir("CUDA", None),
             Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"),
             Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"),
@@ -101,15 +96,10 @@ def get_config_schema():
             Libraries("CUDART", ["cudart"]),
             LibraryDir("CURAND", default_lib_dirs),
             Libraries("CURAND", ["curand"]),
-            StringListOption(
-                "CXXFLAGS",
-                cxxflags_default,
-                help="Any extra C++ compiler options to include",
-            ),
-            StringListOption(
-                "LDFLAGS", ldflags_default, help="Any extra linker options to include"
-            ),
-        ]
+            StringListOption("CXXFLAGS", cxxflags_default,
+                help="Any extra C++ compiler options to include"),
+            StringListOption("LDFLAGS", ldflags_default,
+                help="Any extra linker options to include")]
     )
@@ -117,13 +107,13 @@ def main():
     import sys
 
     from aksetup_helper import (
-        hack_distutils,
-        get_config,
-        setup,
         ExtensionUsingNumpy,
-        set_up_shipped_boost_if_requested,
-        check_git_submodules,
         NumpyBuildExtCommand,
+        check_git_submodules,
+        get_config,
+        hack_distutils,
+        set_up_shipped_boost_if_requested,
+        setup,
     )
 
     check_git_submodules()
@@ -194,7 +184,7 @@ def main():
         # metadata
         version=ver_dic["VERSION_TEXT"],
         description="Python wrapper for Nvidia CUDA",
-        long_description=open("README.rst", "rt").read(),
+        long_description=open("README.rst").read(),
         author="Andreas Kloeckner",
         author_email="inform@tiker.net",
         license="MIT",
@@ -233,13 +223,11 @@ def main():
         ext_modules=[
             ExtensionUsingNumpy(
                 "_driver",
-                [
-                    "src/cpp/cuda.cpp",
+                ["src/cpp/cuda.cpp",
                     "src/cpp/bitlog.cpp",
                     "src/wrapper/wrap_cudadrv.cpp",
                     "src/wrapper/mempool.cpp",
-                ]
-                + EXTRA_SOURCES,
+                    *EXTRA_SOURCES],
                 include_dirs=INCLUDE_DIRS,
                 library_dirs=LIBRARY_DIRS,
                 libraries=LIBRARIES,
diff --git a/test/test_cumath.py b/test/test_cumath.py
index 2db96d15573d1ba6878362ae734c6b387ddf25db..4267e120a7037339500bf3f6ecc5229febbb0e03 100644
--- a/test/test_cumath.py
+++ b/test/test_cumath.py
@@ -1,11 +1,13 @@
+from __future__ import annotations
+
 import math
 
-import numpy as np
-from pycuda.tools import mark_cuda_test
+import numpy as np
 
-import pycuda.gpuarray as gpuarray
-import pycuda.driver as drv  # noqa
 import pycuda.cumath as cumath
+import pycuda.driver as drv  # noqa
+import pycuda.gpuarray as gpuarray
+from pycuda.tools import mark_cuda_test
 
 
 sizes = [10, 128, 1024, 1 << 10, 1 << 13]
@@ -24,13 +26,10 @@ def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False):
     def test():
         gpu_func = getattr(cumath, name)
         cpu_func = getattr(np, numpy_func_names.get(name, name))
-        if complex:
-            _dtypes = complex_dtypes
-        else:
-            _dtypes = dtypes
+        dtypes_ = complex_dtypes if complex else dtypes
 
         for s in sizes:
-            for dtype in _dtypes:
+            for dtype in dtypes_:
                 np.random.seed(1)
                 A = (np.random.random(s) * (b - a) + a).astype(dtype)
                 if complex:
diff --git a/test/test_driver.py b/test/test_driver.py
index d167aed218b96a18cf49c12e61958eecdf547fcc..5e9befe7464aaed0d2f3a6046ee56c253c619638 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = """
 Copyright 2008-2021 Andreas Kloeckner
 Copyright 2021 NVIDIA Corporation
@@ -5,13 +8,12 @@ Copyright 2021 NVIDIA Corporation
 
 import numpy as np
 import numpy.linalg as la
-from pycuda.tools import mark_cuda_test, dtype_to_ctype
-import pytest  # noqa
-
+import pytest
 
-import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
+import pycuda.gpuarray as gpuarray
 from pycuda.compiler import SourceModule
+from pycuda.tools import dtype_to_ctype, mark_cuda_test
 
 
 class TestDriver:
@@ -126,7 +128,7 @@ class TestDriver:
         strm.synchronize()
 
         dest = drv.pagelocked_empty_like(a)
-        multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1,), stream=strm)
+        multiply_them(drv.Out(dest), a_gpu, b_gpu, block=(*shape, 1), stream=strm)
         strm.synchronize()
 
         drv.memcpy_dtoh_async(a, a_gpu, strm)
@@ -206,7 +208,7 @@ class TestDriver:
         drv.matrix_to_texref(a, mtx_tex, order="F")
 
         dest = np.zeros(shape, dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         assert la.norm(dest - a) == 0
 
     @mark_cuda_test
@@ -243,7 +245,7 @@ class TestDriver:
         drv.matrix_to_texref(b, mtx2_tex, order="F")
 
         dest = np.zeros(shape, dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex, mtx2_tex])
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex, mtx2_tex])
 
         assert la.norm(dest - a - b) < 1e-6
 
     @mark_cuda_test
@@ -276,12 +278,12 @@ class TestDriver:
         shape = (5, 6)
         channels = 4
         a = np.asarray(
-            np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F"
+            np.random.randn(*((channels, *shape))), dtype=np.float32, order="F"
         )
         drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex)
 
-        dest = np.zeros(shape + (channels,), dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        dest = np.zeros((*shape, channels), dtype=np.float32)
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         reshaped_a = a.transpose(1, 2, 0)
         # print reshaped_a
         # print dest
@@ -314,13 +316,13 @@ class TestDriver:
 
         shape = (16, 16)
         channels = 4
-        a = np.random.randn(*(shape + (channels,))).astype(np.float32)
+        a = np.random.randn(*((*shape, channels))).astype(np.float32)
         a_gpu = drv.to_device(a)
         mtx_tex.set_address(a_gpu, a.nbytes)
         mtx_tex.set_format(drv.array_format.FLOAT, 4)
 
-        dest = np.zeros(shape + (channels,), dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        dest = np.zeros((*shape, channels), dtype=np.float32)
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         # print a
         # print dest
         assert la.norm(dest - a) == 0
@@ -470,7 +472,9 @@ class TestDriver:
             int row = blockIdx.x*blockDim.x + threadIdx.x;
             int col = blockIdx.y*blockDim.y + threadIdx.y;
             int slice = blockIdx.z*blockDim.z + threadIdx.z;
-            dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
+            dest[row + col*blockDim.x*gridDim.x
+                + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(
+                    mtx_tex, slice, col, row);
         }
         """
         myKern = myKern.replace("fpName", fpName_str)
@@ -537,7 +541,8 @@ class TestDriver:
             int row = blockIdx.x*blockDim.x + threadIdx.x;
             int col = blockIdx.y*blockDim.y + threadIdx.y;
             int slice = blockIdx.z*blockDim.z + threadIdx.z;
-            int tid = row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
+            int tid = row + col*blockDim.x*gridDim.x
+                + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
             if (rw==0){
                 cuPres aux = dest[tid];
                 fp_surf3Dwrite(aux, mtx_tex, row, col, slice,cudaBoundaryModeClamp);}
@@ -694,11 +699,12 @@ class TestDriver:
 
     @mark_cuda_test
     def test_mempool_2(self):
-        from pycuda.tools import DeviceMemoryPool
         from random import randrange
+
+        from pycuda.tools import DeviceMemoryPool
 
         pool = DeviceMemoryPool()
 
-        for i in range(2000):
+        for _i in range(2000):
             s = randrange(1 << 31) >> randrange(32)
             bin_nr = pool.bin_number(s)
             asize = pool.alloc_size(bin_nr)
@@ -709,17 +715,16 @@ class TestDriver:
 
     @mark_cuda_test
     def test_mempool(self):
-        from pycuda.tools import bitlog2
-        from pycuda.tools import DeviceMemoryPool
+        from pycuda.tools import DeviceMemoryPool, bitlog2
 
         pool = DeviceMemoryPool()
         queue = []
-        free, total = drv.mem_get_info()
+        free, _total = drv.mem_get_info()
         e0 = bitlog2(free)
 
         for e in range(e0 - 6, e0 - 4):
-            for i in range(100):
+            for _i in range(100):
                 queue.append(pool.allocate(1 << e))
 
                 if len(queue) > 10:
                     queue.pop(0)
@@ -906,11 +911,7 @@ class TestDriver:
         dest = np.zeros(shape, dtype=tp)
         copy_texture(
             drv.Out(dest),
-            block=shape
-            + (
-                1,
-                1,
-            ),
+            block=(*shape, 1, 1),
             texrefs=[my_tex],
         )
@@ -1084,11 +1085,11 @@ class TestDriver:
         mod = DynamicModule()
         mod.add_source(
             test_outer_cu,
-            nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS),
+            nvcc_options=(["-rdc=true", "-lcudadevrt", *DEFAULT_NVCC_FLAGS]),
         )
         mod.add_source(
             test_inner_cu,
-            nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS),
+            nvcc_options=(["-rdc=true", "-lcudadevrt", *DEFAULT_NVCC_FLAGS]),
         )
         mod.add_stdlib("cudadevrt")
         mod.link()
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 0bc37eb405e41482149303339504a8a623c4b940..6bc0556618dc9d2ff820752d7ec8510e1ab2fc2c 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -1,16 +1,18 @@
 #! /usr/bin/env python
+from __future__ import annotations
+
+import operator
+import sys
 
 import numpy as np
 import numpy.linalg as la
-import sys
-from pycuda.tools import init_cuda_context_fixture
-from pycuda.characterize import has_double_support
+import pytest
 
-import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
+import pycuda.gpuarray as gpuarray
+from pycuda.characterize import has_double_support
 from pycuda.compiler import SourceModule
-import pytest
-import operator
+from pycuda.tools import init_cuda_context_fixture
 
 
 @pytest.fixture(autouse=True)
@@ -235,7 +237,7 @@ class TestGPUArray:
 
         assert (7 + a == a_added).all()
 
-    def test_substract_array(self):
+    def test_subtract_array(self):
         """Test the subtraction of two arrays."""
         # test data
         a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
@@ -312,15 +314,12 @@ class TestGPUArray:
     def test_random(self):
         from pycuda.curandom import rand as curand
 
-        if has_double_support():
-            dtypes = [np.float32, np.float64]
-        else:
-            dtypes = [np.float32]
+        dtypes = [np.float32, np.float64] if has_double_support() else [np.float32]
 
         for dtype in dtypes:
             a = curand((10, 100), dtype=dtype).get()
-            assert (0 <= a).all()
+            assert (a >= 0).all()
             assert (a < 1).all()
 
     def test_curand_wrappers(self):
@@ -334,8 +333,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -344,8 +343,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -360,10 +359,7 @@ class TestGPUArray:
 
             generator_types.extend([MRG32k3aRandomNumberGenerator])
 
-        if has_double_support():
-            dtypes = [np.float32, np.float64]
-        else:
-            dtypes = [np.float32]
+        dtypes = [np.float32, np.float64] if has_double_support() else [np.float32]
 
         for gen_type in generator_types:
             gen = gen_type()
@@ -380,13 +376,13 @@ class TestGPUArray:
 
                 x = gen.gen_uniform(10000, dtype)
                 x_host = x.get()
-                assert (-1 <= x_host).all()
+                assert (x_host >= -1).all()
                 assert (x_host <= 1).all()
 
             gen.gen_uniform(10000, np.uint32)
             if get_curand_version() >= (5, 0, 0):
                 gen.gen_poisson(10000, np.uint32, 13.0)
 
-            for dtype in dtypes + [np.uint32]:
+            for dtype in [*dtypes, np.uint32]:
                 a = gpuarray.empty(1000000, dtype=dtype)
                 v = 10
                 a.fill(v)
@@ -411,8 +407,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -421,8 +417,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -457,7 +453,7 @@ class TestGPUArray:
                 x = gen.gen_uniform(2 ** 31, dtype)
                 if dtype in [np.float32, np.float64]:
                     x_host = x.get()
-                    assert (-1 <= x_host).all()
+                    assert (x_host >= -1).all()
                     assert (x_host <= 1).all()
 
                 del x
@@ -468,7 +464,7 @@ class TestGPUArray:
                 v = 10
                 a.fill(v)
                 gen.fill_poisson(a)
-                tmp = (a.get() == (v - 1)).sum() / a.size  # noqa: F841
+                tmp = (a.get() == (v - 1)).sum() / a.size
                 # Check Poisson statistics (need 1e6 values)
                 # Compare with scipy.stats.poisson.pmf(v - 1, v)
                 assert np.isclose(0.12511, tmp, atol=0.005)
@@ -488,8 +484,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -498,8 +494,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -538,7 +534,7 @@ class TestGPUArray:
                 x = gen.gen_uniform(s, dtype)
                 if dtype in [np.float32, np.float64]:
                     x_host = x.get()
-                    assert (-1 <= x_host).all()
+                    assert (x_host >= -1).all()
                     assert (x_host <= 1).all()
 
                 del x
@@ -550,7 +546,7 @@ class TestGPUArray:
                 v = 10
                 a.fill(v)
                 gen.fill_poisson(a)
-                tmp = (a.get() == (v - 1)).sum() / a.size  # noqa: F841
+                tmp = (a.get() == (v - 1)).sum() / a.size
                 # Check Poisson statistics (need 1e6 values)
                 # Compare with scipy.stats.poisson.pmf(v - 1, v)
                 assert np.isclose(0.12511, tmp, atol=0.005)
@@ -638,7 +634,7 @@ class TestGPUArray:
         #    a[i] = float('nan')
         from random import randrange
 
-        for i in range(size // 10):
+        for _i in range(size // 10):
             a[randrange(0, size)] = float("nan")
         return a
@@ -921,10 +917,11 @@ class TestGPUArray:
             def allocator(size):
                 nonlocal alloc_uses, pool
                 alloc_uses += 1
-                return pool.allocate(size)
+                return pool.allocate(size)  # noqa: B023
 
             alloc = None if pool is None else allocator
-            sum_a_gpu = gpuarray.subset_sum(meaningful_indices_gpu, a_gpu, allocator=alloc).get()
+            sum_a_gpu = gpuarray.subset_sum(
+                meaningful_indices_gpu, a_gpu, allocator=alloc).get()
             assert np.allclose(sum_a_gpu, sum_a)
 
             if pool is not None:
                 assert alloc_uses == 1
@@ -970,7 +967,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(sz)
             end = randrange(start, sz)
@@ -989,7 +986,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(n)
             end = randrange(start, n)
@@ -999,8 +996,8 @@ class TestGPUArray:
             assert la.norm(a_gpu_slice.get() - a_slice) == 0
 
     def test_2d_slice_f(self):
-        from pycuda.curandom import rand as curand
         import pycuda.gpuarray as gpuarray
+        from pycuda.curandom import rand as curand
 
         n = 1000
         m = 300
@@ -1012,7 +1009,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(n)
             end = randrange(start, n)
diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py
index 8562b99c0d5afb3296a628c3ee2d9e0d18c3514d..960c275b6c0d58bcb8794c0d062c8125ea9f75f1 100644
--- a/test/undistributed/elwise-perf.py
+++ b/test/undistributed/elwise-perf.py
@@ -1,7 +1,6 @@
-import pycuda.driver as drv
-import pycuda.autoinit
+from __future__ import annotations
+
 import numpy
-import numpy.linalg as la
 
 
 def main():
@@ -22,17 +21,14 @@ def main():
        a.fill(1)
        b.fill(2)
 
-        if power > 20:
-            count = 10
-        else:
-            count = 100
+        count = 10 if power > 20 else 100
 
        elapsed = [0]
 
        def add_timer(_, time):
            elapsed[0] += time()
 
-        for i in range(count):
+        for _i in range(count):
            a.mul_add(1, b, 2, add_timer)
 
        bytes = a.nbytes * count * 3
diff --git a/test/undistributed/measure_gpuarray_speed.py b/test/undistributed/measure_gpuarray_speed.py
index 4f6f390e621aa90cbcd9f19f7eacab44e6773c11..4e441fde9a749b5a09a02db3b2f477dcd0a2349c 100755
--- a/test/undistributed/measure_gpuarray_speed.py
+++ b/test/undistributed/measure_gpuarray_speed.py
@@ -1,8 +1,9 @@
 #! /usr/bin/env python
-import pycuda.driver as drv
-import pycuda.autoinit
+from __future__ import annotations
+
 import numpy
-import numpy.linalg as la
+
+import pycuda.driver as drv
 
 
 def main():
@@ -26,17 +27,14 @@ def main():
            b = gpuarray.zeros((size,), dtype=numpy.float32)
            b.fill(1)
 
-            if power > 20:
-                count = 100
-            else:
-                count = 1000
+            count = 100 if power > 20 else 1000
 
            # gpu -----------------------------------------------------------------
            start = drv.Event()
            end = drv.Event()
            start.record()
 
-            for i in range(count):
+            for _i in range(count):
                a + b
 
            end.record()
@@ -57,7 +55,7 @@ def main():
            from time import time
 
            start = time()
-            for i in range(count):
+            for _i in range(count):
                a_cpu + b_cpu
 
            secs = time() - start
diff --git a/test/undistributed/reduction-perf.py b/test/undistributed/reduction-perf.py
index da4a0d03c8a62e78838a8acb202dd9f452ffcf08..e9f604cbf257065cd206bab001fdc6874d6f080e 100644
--- a/test/undistributed/reduction-perf.py
+++ b/test/undistributed/reduction-perf.py
@@ -1,8 +1,9 @@
-import pycuda.autoinit
-import pycuda.gpuarray as gpuarray
-import pycuda.driver as cuda
+from __future__ import annotations
+
 import numpy
 
+import pycuda.driver as cuda
+
 
 def main():
     from pytools import Table
@@ -41,12 +42,12 @@ def main():
             return result
 
         # warm-up
-        for i in range(3):
+        for _i in range(3):
             krnl(a_gpu, b_gpu)
 
         cnt = 10
 
-        for i in range(cnt):
+        for _i in range(cnt):
             krnl(
                 a_gpu,
                 b_gpu,
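# [Editor's note -- not part of the patch, which is truncated above] A minimal,
# self-contained sketch of the cache-key convention that the
# context_dependent_memoize hunk in pycuda/tools.py earlier in this diff settles
# on: keyword arguments enter the key only when present, so purely positional
# call sites keep the cheaper key shape they had before this change. The helper
# name below is hypothetical.
def _memo_cache_key(args, kwargs):
    # frozenset makes keyword order irrelevant; the bare (args,) branch keeps
    # positional-only keys compact
    return (args, frozenset(kwargs.items())) if kwargs else (args,)


assert _memo_cache_key((1, 2), {}) == ((1, 2),)
assert _memo_cache_key((1,), {"b": 2}) == ((1,), frozenset({("b", 2)}))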