diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a6b56f8db65e7a1856d82db1e005c788a272d3e5..8123bf042251efad613755f0184f42d7682e293c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
         -   name: "Main Script"
             run: |
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-                . ./prepare-and-run-flake8.sh ./loopy ./test
+                . ./prepare-and-run-flake8.sh "$(basename "$GITHUB_REPOSITORY")" ./test examples
 
     pylint:
         name: Pylint
@@ -35,7 +35,7 @@ jobs:
                 CONDA_ENVIRONMENT=.test-conda-env.yml
                 USE_CONDA_BUILD=1
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
-                . ./prepare-and-run-pylint.sh loopy test/test_*.py
+                . ./prepare-and-run-pylint.sh "$(basename "$GITHUB_REPOSITORY")" test/test_*.py
 
     pytest3:
         name: Conda Pytest
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 515473e6ab76577a5c47aca6d39ce2f241a5e795..851caaebd4334ac9421c42d60dcaca8d57a812ca 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -89,7 +89,7 @@ Pylint:
   - export PY_EXE=python3
   - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
-  - ". ./prepare-and-run-pylint.sh loopy test/test_*.py"
+  - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
   tags:
   - python3
   except:
@@ -106,7 +106,7 @@ Documentation:
 Flake8:
   script:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-  - ". ./prepare-and-run-flake8.sh loopy test"
+  - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
   tags:
   - python3
   except:
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index 7ab049cd1906f703b0efc39808ff68a63b91ff37..7f80175ebe82b8412a38708a5b1d32042d8061fe 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -1,7 +1,5 @@
 import numpy as np
 import loopy as lp
-import pyopencl as cl
-import pyopencl.array
 
 knl = lp.make_kernel(
         "{ [i,k]: 0<=i<n and 0<=k<3 }",
diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 9098c544490035dd19b960422d1abbb7a5210b68..3458a6e0e989615c3c699831709055a2ec55bde2 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -2,7 +2,7 @@ import numpy as np
 import loopy as lp
 import pyopencl as cl
 import pyopencl.array
-from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
 
 # setup
 # -----
diff --git a/examples/python/rank-one.py b/examples/python/rank-one.py
index eda11fc155fc951246381ca697409615fa0be90a..aa2a650feb165684a9d65207772e093568b9f98e 100644
--- a/examples/python/rank-one.py
+++ b/examples/python/rank-one.py
@@ -34,9 +34,9 @@ split_knl = knl
 
 # PREFETCH1BEGIN
 knl = lp.add_prefetch(knl, "a",
-        fetch_outer_inames='i_outer, i_inner, j_outer, j_inner')
+        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
 knl = lp.add_prefetch(knl, "b",
-        fetch_outer_inames='i_outer, i_inner, j_outer, j_inner')
+        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
 # PREFETCH1END
 
 knl = lp.set_options(knl, write_code=True)
@@ -46,11 +46,11 @@ knl = split_knl
 
 # PREFETCH2BEGIN
 knl = lp.add_prefetch(knl, "a", ["i_inner"],
-        fetch_outer_inames='i_outer, j_outer, j_inner',
+        fetch_outer_inames="i_outer, j_outer, j_inner",
         temporary_address_space=lp.AddressSpace.LOCAL,
         default_tag="l.0")
 knl = lp.add_prefetch(knl, "b", ["j_inner"],
-        fetch_outer_inames='i_outer, j_outer, j_inner',
+        fetch_outer_inames="i_outer, j_outer, j_inner",
         temporary_address_space=lp.AddressSpace.LOCAL,
         default_tag="l.0")
 # PREFETCH2END
@@ -67,9 +67,9 @@ knl = lp.split_iname(knl, "j", 256,
         outer_tag="g.1", slabs=(0, 1))
 
 knl = lp.add_prefetch(knl, "a", ["i_inner"],
-        fetch_outer_inames='i_outer, j_outer', default_tag=None)
+        fetch_outer_inames="i_outer, j_outer", default_tag=None)
 knl = lp.add_prefetch(knl, "b", ["j_inner"],
-        fetch_outer_inames='i_outer, j_outer', default_tag=None)
+        fetch_outer_inames="i_outer, j_outer", default_tag=None)
 
 knl = lp.split_iname(knl, "i_inner", 16,
         inner_tag="l.0")
diff --git a/loopy/check.py b/loopy/check.py
index 7e0475d82f321f2dcba6ad3f32657ce64bb4508f..b3b169d7610ea55fc966b2ccfdd5ab0989f1f83c 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -585,7 +585,7 @@ def _get_topological_order(kernel):
 
     for scc in sccs:
         if len(scc) != 1:
-            raise DependencyCycleFound(', '.join(scc))
+            raise DependencyCycleFound(", ".join(scc))
         order.append(scc[0])
 
     return order
@@ -1095,7 +1095,7 @@ def check_that_all_insns_are_scheduled(kernel):
         from loopy.diagnostic import UnscheduledInstructionError
         raise UnscheduledInstructionError(
             "unscheduled instructions: '%s'"
-            % ', '.join(all_schedulable_insns - scheduled_insns))
+            % ", ".join(all_schedulable_insns - scheduled_insns))
 
 # }}}
 
diff --git a/loopy/cli.py b/loopy/cli.py
index cdc24800be0edf3935aacccdd4dc4d9905cf5965..d99cf773104b7464dfe7aa8a18c9867821450d07 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -60,7 +60,7 @@ def main():
 
     parser.add_argument("infile", metavar="INPUT_FILE")
     parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE",
-            help="Defaults to stdout ('-').", nargs='?')
+            help="Defaults to stdout ('-').", nargs="?")
     parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran")
     parser.add_argument("--target", choices=(
         "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"),
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index c0ca875c0e9b661becb1bb0ca6e81139a8a93e2d..bf02131e8c52a5b595377f5bda503d45a4da3028 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -208,7 +208,7 @@ def generate_assignment_instruction_code(codegen_state, insn):
         else:
             printf_args_str = ""
 
-        printf_insn = S("printf(\"%s\\n\"%s)" % (
+        printf_insn = S('printf("%s\\n"%s)' % (
                     printf_format, printf_args_str))
 
         from cgen import Block
diff --git a/loopy/expression.py b/loopy/expression.py
index 8414efaa5dd614d39e93f55aea3836141e5a6d6e..b6b85e281b7f87b5d8d906f82f74d6b96d577f02 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -32,20 +32,20 @@ from loopy.diagnostic import LoopyError
 
 
 # type_context may be:
-# - 'i' for integer -
-# - 'f' for single-precision floating point
-# - 'd' for double-precision floating point
+# - "i" for integer -
+# - "f" for single-precision floating point
+# - "d" for double-precision floating point
 # or None for 'no known context'.
 
 def dtype_to_type_context(target, dtype):
     from loopy.types import NumpyType
 
     if dtype.is_integral():
-        return 'i'
+        return "i"
     if isinstance(dtype, NumpyType) and dtype.dtype in [np.float64, np.complex128]:
-        return 'd'
+        return "d"
     if isinstance(dtype, NumpyType) and dtype.dtype in [np.float32, np.complex64]:
-        return 'f'
+        return "f"
     if target.is_vector_dtype(dtype):
         return dtype_to_type_context(
                 target, NumpyType(dtype.numpy_dtype.fields["x"][0]))
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 40202d4da3319c0ef24b0317f01cd4d31f88d484..81632084bbbedaab3f76859627a460998a926a9c 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -257,9 +257,9 @@ def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True,
     import logging
     console = logging.StreamHandler()
     console.setLevel(logging.INFO)
-    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
+    formatter = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s")
     console.setFormatter(formatter)
-    logging.getLogger('fparser').addHandler(console)
+    logging.getLogger("fparser").addHandler(console)
 
     from fparser import api
     tree = api.parse(source, isfree=free_form, isstrict=strict,
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index bc5c51eb0453a34ad902d58903997b75d6c54f34..d363eba722e2f8b45e2f614da7c9d473fe9e0a26 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -198,7 +198,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
             raise ValueError(
                 "unknown scope for nosync option: '%s' "
                 "(allowable scopes are %s)" %
-                (scope, ', '.join("'%s'" % s for s in allowable_scopes)))
+                (scope, ", ".join("'%s'" % s for s in allowable_scopes)))
         return _NosyncParseResult(expr, scope)
 
     for option in options_str.split(","):
@@ -361,7 +361,7 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
 
         elif opt_key == "mem_kind":
             opt_value = opt_value.lower().strip()
-            if opt_value not in ['local', 'global']:
+            if opt_value not in ["local", "global"]:
                 raise LoopyError("Unknown memory synchronization type %s specified"
                     " expected, 'local' or 'global'."
                     % opt_value)
@@ -437,13 +437,13 @@ SUBST_RE = re.compile(
 
 def check_illegal_options(insn_options, insn_type):
     illegal_options = []
-    if insn_type not in ['gbarrier', 'lbarrier']:
-        illegal_options.append('mem_kind')
+    if insn_type not in ["gbarrier", "lbarrier"]:
+        illegal_options.append("mem_kind")
 
     bad_options = [x for x in illegal_options if x in insn_options]
     if bad_options:
         raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" %
-                         ', '.join(bad_options), insn_type)
+                         (", ".join(bad_options), insn_type))
 
 
 def parse_insn(groups, insn_options):
@@ -516,7 +516,7 @@ def parse_insn(groups, insn_options):
             assignee_names=assignee_names)
 
     # check for bad options
-    check_illegal_options(insn_options, 'assignment')
+    check_illegal_options(insn_options, "assignment")
 
     insn_id = insn_options.pop("insn_id", None)
     inames_to_dup = insn_options.pop("inames_to_dup", [])
@@ -757,8 +757,8 @@ def parse_instructions(instructions, defines):
 
     insn_options_stack = [get_default_insn_options_dict()]
     if_predicates_stack = [
-            {'predicates': frozenset(),
-                'insn_predicates': frozenset()}]
+            {"predicates": frozenset(),
+                "insn_predicates": frozenset()}]
 
     for insn in instructions:
         if isinstance(insn, InstructionBase):
@@ -819,7 +819,7 @@ def parse_instructions(instructions, defines):
                         insn_options_stack[-1],
                         with_options_match.group("options")))
             # check for bad options
-            check_illegal_options(insn_options_stack[-1], 'with-block')
+            check_illegal_options(insn_options_stack[-1], "with-block")
             continue
 
         for_match = FOR_RE.match(insn)
@@ -859,7 +859,7 @@ def parse_instructions(instructions, defines):
 
             #add to the if_stack
             if_options = options.copy()
-            if_options['insn_predicates'] = options["predicates"]
+            if_options["insn_predicates"] = options["predicates"]
             if_predicates_stack.append(if_options)
             del options
             del predicate
@@ -923,9 +923,9 @@ def parse_instructions(instructions, defines):
         if insn == "end":
             obj = insn_options_stack.pop()
             #if this object is the end of an if statement
-            if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
+            if obj["predicates"] == if_predicates_stack[-1]["insn_predicates"] and\
                     if_predicates_stack[-1]["insn_predicates"] and\
-                    obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
+                    obj["within_inames"] == if_predicates_stack[-1]["within_inames"]:
                 if_predicates_stack.pop()
             continue
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 82e9d36c225fb60d00a725371201d687d9a418e7..8b4c359bdfbae3f8372babc37e3d9ace3701ce73 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -340,8 +340,8 @@ class KernelArgument(ImmutableRecord):
 
         dtype = kwargs.pop("dtype", None)
 
-        if 'for_atomic' in kwargs:
-            for_atomic = kwargs['for_atomic']
+        if "for_atomic" in kwargs:
+            for_atomic = kwargs["for_atomic"]
         else:
             for_atomic = False
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index a22ff505fadfaf46d6ee4886d4cd24561533bed3..4f50a447cbcda2f2e89258310ad73b1417ddb95e 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -687,7 +687,7 @@ class AtomicInit(OrderedAtomic):
 
         One of the values from :class:`MemoryScope`
     """
-    op_name = 'init'
+    op_name = "init"
 
 
 class AtomicUpdate(OrderedAtomic):
@@ -702,7 +702,7 @@ class AtomicUpdate(OrderedAtomic):
 
         One of the values from :class:`MemoryScope`
     """
-    op_name = 'update'
+    op_name = "update"
 
 
 class AtomicLoad(OrderedAtomic):
@@ -716,7 +716,7 @@ class AtomicLoad(OrderedAtomic):
 
         One of the values from :class:`MemoryScope`
     """
-    op_name = 'load'
+    op_name = "load"
 
 # }}}
 
@@ -1380,7 +1380,7 @@ class BarrierInstruction(_DataObliviousInstruction):
         options = self.get_str_options()
         if self.synchronization_kind == "local":
             # add the memory kind
-            options += ['mem_kind={}'.format(self.mem_kind)]
+            options += ["mem_kind={}".format(self.mem_kind)]
         if options:
             first_line += " {%s}" % (": ".join(options))
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 0cf0ff3fb96c0e913e8229615042cb2a3e20ce1c..eb4feb854e5e2ac0f81861e160cbd4d36f8b4215 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -481,7 +481,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
 
     for insn in kernel.instructions:
         if isinstance(insn, MultiAssignmentBase):
-            lhs = ', '.join(str(assignee) for assignee in insn.assignees)
+            lhs = ", ".join(str(assignee) for assignee in insn.assignees)
             op = "%s <- %s" % (lhs, insn.expression)
             if len(op) > 200:
                 op = op[:200] + "..."
@@ -498,7 +498,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
             insn_label = op
             tooltip = insn.id
 
-        lines.append("\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];"
+        lines.append('"%s" [label="%s",shape="box",tooltip="%s"];'
                 % (
                     insn.id,
                     repr(insn_label)[1:-1],
@@ -542,7 +542,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
 
         for sched_item in kernel.schedule:
             if isinstance(sched_item, EnterLoop):
-                lines.append("subgraph cluster_%s { label=\"%s\""
+                lines.append('subgraph cluster_%s { label="%s"'
                         % (sched_item.iname, sched_item.iname))
             elif isinstance(sched_item, LeaveLoop):
                 lines.append("}")
@@ -1546,8 +1546,8 @@ def stringify_instruction_list(kernel):
             options.append("no_sync_with=%s" % ":".join(
                 "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
         if isinstance(insn, lp.BarrierInstruction) and \
-                insn.synchronization_kind == 'local':
-            options.append('mem_kind=%s' % insn.mem_kind)
+                insn.synchronization_kind == "local":
+            options.append("mem_kind=%s" % insn.mem_kind)
 
         if lhs:
             core = "%s = %s" % (
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index aacce544b35a31359cb535dfeacc46d6e7e2acda..7e1b7af5a818663f1c6e7d56fa93c90bc73ad26c 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -164,11 +164,11 @@ def get_le_neutral(dtype):
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
-            #32 bit integer
+            # 32 bit integer
             return var("INT_MAX")
         elif dtype.numpy_dtype.itemsize == 8:
-            #64 bit integer
-            return var('LONG_MAX')
+            # 64 bit integer
+            return var("LONG_MAX")
     else:
         raise NotImplementedError("less")
 
@@ -182,11 +182,11 @@ def get_ge_neutral(dtype):
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
-            #32 bit integer
+            # 32 bit integer
             return var("INT_MIN")
         elif dtype.numpy_dtype.itemsize == 8:
-            #64 bit integer
-            return var('LONG_MIN')
+            # 64 bit integer
+            return var("LONG_MIN")
     else:
         raise NotImplementedError("less")
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 6ce9d0a9972754d12e249d3ad41d5bdd2746c75c..4cc4fcd23c620b9866aefe0ed481d58bdd28b471 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -447,9 +447,9 @@ def format_insn(kernel, insn_id):
             Fore.MAGENTA, str(insn.expression), Style.RESET_ALL,
             format_insn_id(kernel, insn_id))
     elif isinstance(insn, BarrierInstruction):
-        mem_kind = ''
-        if insn.synchronization_kind == 'local':
-            mem_kind = '{mem_kind=%s}' % insn.mem_kind
+        mem_kind = ""
+        if insn.synchronization_kind == "local":
+            mem_kind = "{mem_kind=%s}" % insn.mem_kind
 
         return "[%s] %s... %sbarrier%s%s" % (
                 format_insn_id(kernel, insn_id),
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 2a005a73105a675c45c5781d44d0a0a0e816650e..46904aeea417492e38e30b22c897ad13565a06b5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -108,7 +108,7 @@ class GuardedPwQPolynomial(object):
 
     @staticmethod
     def zero():
-        p = isl.PwQPolynomial('{ 0 }')
+        p = isl.PwQPolynomial("{ 0 }")
         return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space))
 
     def __str__(self):
@@ -221,10 +221,10 @@ class ToCountMap(object):
 
             # (first create loopy kernel and specify array data types)
 
-            params = {'n': 512, 'm': 256, 'l': 128}
+            params = {"n": 512, "m": 256, "l": 128}
             mem_map = lp.get_mem_access_map(knl)
-            filtered_map = mem_map.filter_by(direction=['load'],
-                                             variable=['a','g'])
+            filtered_map = mem_map.filter_by(direction=["load"],
+                                             variable=["a", "g"])
             tot_loads_a_g = filtered_map.eval_and_sum(params)
 
             # (now use these counts to, e.g., predict performance)
@@ -234,8 +234,8 @@ class ToCountMap(object):
         result_map = ToCountMap(val_type=self.val_type)
 
         from loopy.types import to_loopy_type
-        if 'dtype' in kwargs.keys():
-            kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
+        if "dtype" in kwargs.keys():
+            kwargs["dtype"] = [to_loopy_type(d) for d in kwargs["dtype"]]
 
         # for each item in self.count_map
         for self_key, self_val in self.items():
@@ -267,7 +267,7 @@ class ToCountMap(object):
 
             # (first create loopy kernel and specify array data types)
 
-            params = {'n': 512, 'm': 256, 'l': 128}
+            params = {"n": 512, "m": 256, "l": 128}
             mem_map = lp.get_mem_access_map(knl)
             def filter_func(key):
                 return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4:
@@ -302,29 +302,29 @@ class ToCountMap(object):
 
             # (first create loopy kernel and specify array data types)
 
-            params = {'n': 512, 'm': 256, 'l': 128}
+            params = {"n": 512, "m": 256, "l": 128}
             mem_map = get_mem_access_map(knl)
-            grouped_map = mem_map.group_by('mtype', 'dtype', 'direction')
+            grouped_map = mem_map.group_by("mtype", "dtype", "direction")
 
-            f32_global_ld = grouped_map[MemAccess(mtype='global',
+            f32_global_ld = grouped_map[MemAccess(mtype="global",
                                                   dtype=np.float32,
-                                                  direction='load')
+                                                  direction="load")
                                        ].eval_with_dict(params)
-            f32_global_st = grouped_map[MemAccess(mtype='global',
+            f32_global_st = grouped_map[MemAccess(mtype="global",
                                                   dtype=np.float32,
-                                                  direction='store')
+                                                  direction="store")
                                        ].eval_with_dict(params)
-            f32_local_ld = grouped_map[MemAccess(mtype='local',
+            f32_local_ld = grouped_map[MemAccess(mtype="local",
                                                  dtype=np.float32,
-                                                 direction='load')
+                                                 direction="load")
                                       ].eval_with_dict(params)
-            f32_local_st = grouped_map[MemAccess(mtype='local',
+            f32_local_st = grouped_map[MemAccess(mtype="local",
                                                  dtype=np.float32,
-                                                 direction='store')
+                                                 direction="store")
                                       ].eval_with_dict(params)
 
             op_map = get_op_map(knl)
-            ops_dtype = op_map.group_by('dtype')
+            ops_dtype = op_map.group_by("dtype")
 
             f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params)
             f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params)
@@ -372,20 +372,20 @@ class ToCountMap(object):
             # (first create loopy kernel and specify array data types)
 
             bytes_map = get_mem_access_map(knl).to_bytes()
-            params = {'n': 512, 'm': 256, 'l': 128}
+            params = {"n": 512, "m": 256, "l": 128}
 
             s1_g_ld_byt = bytes_map.filter_by(
-                                mtype=['global'], lid_strides={0: 1},
-                                direction=['load']).eval_and_sum(params)
+                                mtype=["global"], lid_strides={0: 1},
+                                direction=["load"]).eval_and_sum(params)
             s2_g_ld_byt = bytes_map.filter_by(
-                                mtype=['global'], lid_strides={0: 2},
-                                direction=['load']).eval_and_sum(params)
+                                mtype=["global"], lid_strides={0: 2},
+                                direction=["load"]).eval_and_sum(params)
             s1_g_st_byt = bytes_map.filter_by(
-                                mtype=['global'], lid_strides={0: 1},
-                                direction=['store']).eval_and_sum(params)
+                                mtype=["global"], lid_strides={0: 1},
+                                direction=["store"]).eval_and_sum(params)
             s2_g_st_byt = bytes_map.filter_by(
-                                mtype=['global'], lid_strides={0: 2},
-                                direction=['store']).eval_and_sum(params)
+                                mtype=["global"], lid_strides={0: 2},
+                                direction=["store"]).eval_and_sum(params)
 
             # (now use these counts to, e.g., predict performance)
 
@@ -438,10 +438,10 @@ class ToCountMap(object):
 
             # (first create loopy kernel and specify array data types)
 
-            params = {'n': 512, 'm': 256, 'l': 128}
+            params = {"n": 512, "m": 256, "l": 128}
             mem_map = lp.get_mem_access_map(knl)
-            filtered_map = mem_map.filter_by(direction=['load'],
-                                             variable=['a', 'g'])
+            filtered_map = mem_map.filter_by(direction=["load"],
+                                             variable=["a", "g"])
             tot_loads_a_g = filtered_map.eval_and_sum(params)
 
             # (now use these counts to, e.g., predict performance)
@@ -507,7 +507,7 @@ class Op(Record):
        once per *work-item*, *sub-group*, or *work-group*. The granularities
        allowed can be found in :class:`CountGranularity`, and may be accessed,
        e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance
-       of computation executing on a single processor (think 'thread'), a
+       of computation executing on a single processor (think "thread"), a
        collection of which may be grouped together into a work-group. Each
        work-group executes on a single compute unit with all work-items within
        the work-group sharing local memory. A sub-group is an
@@ -593,7 +593,7 @@ class MemAccess(Record):
        once per *work-item*, *sub-group*, or *work-group*. The granularities
        allowed can be found in :class:`CountGranularity`, and may be accessed,
        e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance
-       of computation executing on a single processor (think 'thread'), a
+       of computation executing on a single processor (think "thread"), a
        collection of which may be grouped together into a work-group. Each
        work-group executes on a single compute unit with all work-items within
        the work-group sharing local memory. A sub-group is an
@@ -725,7 +725,7 @@ class ExpressionOpCounter(CounterBase):
     def map_call(self, expr):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
-                        name='func:'+str(expr.function),
+                        name="func:"+str(expr.function),
                         count_granularity=CountGranularity.SUBGROUP): 1}
                     ) + self.rec(expr.parameters)
 
@@ -739,7 +739,7 @@ class ExpressionOpCounter(CounterBase):
         assert expr.children
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
-                        name='add',
+                        name="add",
                         count_granularity=CountGranularity.SUBGROUP):
                      len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
@@ -748,18 +748,18 @@ class ExpressionOpCounter(CounterBase):
         from pymbolic.primitives import is_zero
         assert expr.children
         return sum(ToCountMap({Op(dtype=self.type_inf(expr),
-                                  name='mul',
+                                  name="mul",
                                   count_granularity=CountGranularity.SUBGROUP): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
                    ToCountMap({Op(dtype=self.type_inf(expr),
-                                  name='mul',
+                                  name="mul",
                                   count_granularity=CountGranularity.SUBGROUP): -1})
 
     def map_quotient(self, expr, *args):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='div',
+                              name="div",
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
@@ -769,14 +769,14 @@ class ExpressionOpCounter(CounterBase):
 
     def map_power(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='pow',
+                              name="pow",
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='shift',
+                              name="shift",
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
@@ -785,13 +785,13 @@ class ExpressionOpCounter(CounterBase):
 
     def map_bitwise_not(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='bw',
+                              name="bw",
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='bw',
+                              name="bw",
                               count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                                 + sum(self.rec(child) for child in expr.children)
@@ -815,7 +815,7 @@ class ExpressionOpCounter(CounterBase):
 
     def map_min(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
-                              name='maxmin',
+                              name="maxmin",
                               count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                + sum(self.rec(child) for child in expr.children)
@@ -958,7 +958,7 @@ class LocalMemAccessCounter(MemAccessCounter):
                 if index is None:
                     # no subscript
                     sub_map[MemAccess(
-                                mtype='local',
+                                mtype="local",
                                 dtype=dtype,
                                 count_granularity=CountGranularity.SUBGROUP)
                             ] = 1
@@ -975,7 +975,7 @@ class LocalMemAccessCounter(MemAccessCounter):
                                                 self.knl, array, index_tuple)
 
                 sub_map[MemAccess(
-                        mtype='local',
+                        mtype="local",
                         dtype=dtype,
                         lid_strides=dict(sorted(six.iteritems(lid_strides))),
                         gid_strides=dict(sorted(six.iteritems(gid_strides))),
@@ -1015,7 +1015,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
             # this array is not in global memory
             return ToCountMap()
 
-        return ToCountMap({MemAccess(mtype='global',
+        return ToCountMap({MemAccess(mtype="global",
                                      dtype=self.type_inf(expr), lid_strides={},
                                      gid_strides={}, variable=name,
                                      count_granularity=CountGranularity.WORKITEM): 1}
@@ -1050,7 +1050,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
                                 ) else CountGranularity.SUBGROUP
 
         return ToCountMap({MemAccess(
-                            mtype='global',
+                            mtype="global",
                             dtype=self.type_inf(expr),
                             lid_strides=dict(sorted(six.iteritems(lid_strides))),
                             gid_strides=dict(sorted(six.iteritems(gid_strides))),
@@ -1381,14 +1381,14 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         count operations inside array indices.
 
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
-        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
         a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
         made, if this fails an error will be raised. If a :class:`str`
-        ``'guess'`` is passed as the subgroup_size, get_mem_access_map will
+        ``"guess"`` is passed as the subgroup_size, get_mem_access_map will
         attempt to find the sub-group size using the device and, if
         unsuccessful, will make a wild guess.
 
@@ -1407,13 +1407,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         # (first create loopy kernel and specify array data types)
 
         op_map = get_op_map(knl)
-        params = {'n': 512, 'm': 256, 'l': 128}
+        params = {"n": 512, "m": 256, "l": 128}
         f32add = op_map[Op(np.float32,
-                           'add',
+                           "add",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)
         f32mul = op_map[Op(np.float32,
-                           'mul',
+                           "mul",
                            count_granularity=CountGranularity.WORKITEM)
                        ].eval_with_dict(params)
 
@@ -1493,7 +1493,7 @@ def _process_subgroup_size(knl, subgroup_size_requested):
 
         if subgroup_size_requested is None:
             if subgroup_size_guess is None:
-                # 'guess' was not passed and either no target device found
+                # "guess" was not passed and either no target device found
                 # or get_simd_group_size returned None
                 raise ValueError("No sub-group size passed, no target device found. "
                                  "Either (1) pass integer value for subgroup_size, "
@@ -1503,7 +1503,7 @@ def _process_subgroup_size(knl, subgroup_size_requested):
             else:
                 return subgroup_size_guess
 
-        elif subgroup_size_requested == 'guess':
+        elif subgroup_size_requested == "guess":
             if subgroup_size_guess is None:
                 # unable to get subgroup_size from device, so guess
                 subgroup_size_guess = 32
@@ -1539,14 +1539,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for
         code optimization.)
 
-    :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or
+    :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or
         *None* that specifies the sub-group size. An OpenCL sub-group is an
         implementation-dependent grouping of work-items within a work-group,
         analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
         counting a :class:`MemAccess` whose count_granularity specifies that it
         should only be counted once per sub-group. If set to *None* an attempt
         to find the sub-group size using the device will be made, if this fails
-        an error will be raised. If a :class:`str` ``'guess'`` is passed as
+        an error will be raised. If a :class:`str` ``"guess"`` is passed as
         the subgroup_size, get_mem_access_map will attempt to find the
         sub-group size using the device and, if unsuccessful, will make a wild
         guess.
@@ -1565,43 +1565,43 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
 
         # (first create loopy kernel and specify array data types)
 
-        params = {'n': 512, 'm': 256, 'l': 128}
+        params = {"n": 512, "m": 256, "l": 128}
         mem_map = get_mem_access_map(knl)
 
         f32_s1_g_ld_a = mem_map[MemAccess(
-                                    mtype='global',
+                                    mtype="global",
                                     dtype=np.float32,
                                     lid_strides={0: 1},
                                     gid_strides={0: 256},
-                                    direction='load',
-                                    variable='a',
+                                    direction="load",
+                                    variable="a",
                                     count_granularity=CountGranularity.WORKITEM)
                                ].eval_with_dict(params)
         f32_s1_g_st_a = mem_map[MemAccess(
-                                    mtype='global',
+                                    mtype="global",
                                     dtype=np.float32,
                                     lid_strides={0: 1},
                                     gid_strides={0: 256},
-                                    direction='store',
-                                    variable='a',
+                                    direction="store",
+                                    variable="a",
                                     count_granularity=CountGranularity.WORKITEM)
                                ].eval_with_dict(params)
         f32_s1_l_ld_x = mem_map[MemAccess(
-                                    mtype='local',
+                                    mtype="local",
                                     dtype=np.float32,
                                     lid_strides={0: 1},
                                     gid_strides={0: 256},
-                                    direction='load',
-                                    variable='x',
+                                    direction="load",
+                                    variable="x",
                                     count_granularity=CountGranularity.WORKITEM)
                                ].eval_with_dict(params)
         f32_s1_l_st_x = mem_map[MemAccess(
-                                    mtype='local',
+                                    mtype="local",
                                     dtype=np.float32,
                                     lid_strides={0: 1},
                                     gid_strides={0: 256},
-                                    direction='store',
-                                    variable='x',
+                                    direction="store",
+                                    variable="x",
                                     count_granularity=CountGranularity.WORKITEM)
                                ].eval_with_dict(params)
 
@@ -1691,14 +1691,14 @@ def get_synchronization_map(knl, subgroup_size=None):
     :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
-        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
         a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
         made, if this fails an error will be raised. If a :class:`str`
-        ``'guess'`` is passed as the subgroup_size, get_mem_access_map will
+        ``"guess"`` is passed as the subgroup_size, get_mem_access_map will
         attempt to find the sub-group size using the device and, if
         unsuccessful, will make a wild guess.
 
@@ -1714,8 +1714,8 @@ def get_synchronization_map(knl, subgroup_size=None):
         # (first create loopy kernel and specify array data types)
 
         sync_map = get_synchronization_map(knl)
-        params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_ct = sync_map['barrier_local'].eval_with_dict(params)
+        params = {"n": 512, "m": 256, "l": 128}
+        barrier_ct = sync_map["barrier_local"].eval_with_dict(params)
 
         # (now use this count to, e.g., predict performance)
 
@@ -1732,7 +1732,7 @@ def get_synchronization_map(knl, subgroup_size=None):
 
     result = ToCountMap()
 
-    one = isl.PwQPolynomial('{ 1 }')
+    one = isl.PwQPolynomial("{ 1 }")
 
     def get_count_poly(iname_list):
         if iname_list:  # (if iname_list is not empty)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 802cc7044bf73d51567e23a6eaac791982709d51..fe9717765d2e7f4f207be3e198170f26997cd022 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -1064,7 +1064,7 @@ def generate_header(kernel, codegen_result=None):
 
     if not isinstance(kernel.target, CFamilyTarget):
         raise LoopyError(
-                'Header generation for non C-based languages are not implemented')
+                "Header generation for non C-based languages are not implemented")
 
     if codegen_result is None:
         from loopy.codegen import generate_code_v2
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 45fad014316511bb852cbf18173fa389e75732c5..9c147b6339317e3f5bcd5df2eb4a6474d3c64874 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -54,7 +54,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     def python_dtype_str(self, dtype):
         if np.dtype(str(dtype)).isbuiltin:
             return "_lpy_np."+dtype.name
-        raise Exception('dtype: {0} not recognized'.format(dtype))
+        raise Exception("dtype: {0} not recognized".format(dtype))
 
     # {{{ handle non numpy arguements
 
@@ -149,7 +149,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
             kernel, implemented_data_info):
         gen("for knl in _lpy_c_kernels:")
         with Indentation(gen):
-            gen('knl({args})'.format(
+            gen("knl({args})".format(
                 args=", ".join(args)))
 
     # }}}
@@ -163,7 +163,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
         if options.return_dict:
             gen("return None, {%s}"
-                    % ", ".join("\"%s\": %s" % (arg.name, arg.name)
+                    % ", ".join('"%s": %s' % (arg.name, arg.name)
                         for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
                         if arg.base_name in kernel.get_written_variables()))
@@ -211,10 +211,10 @@ class CCompiler(object):
     """
 
     def __init__(self, toolchain=None,
-                 cc='gcc', cflags='-std=c99 -O3 -fPIC'.split(),
-                 ldflags='-shared'.split(), libraries=[],
+                 cc="gcc", cflags="-std=c99 -O3 -fPIC".split(),
+                 ldflags="-shared".split(), libraries=[],
                  include_dirs=[], library_dirs=[], defines=[],
-                 source_suffix='c'):
+                 source_suffix="c"):
         # try to get a default toolchain
         # or subclass supplied version if available
         self.toolchain = toolchain
@@ -225,32 +225,32 @@ class CCompiler(object):
                 # missing compiler python was built with (likely, Conda)
                 # use a default GCCToolchain
                 logger = logging.getLogger(__name__)
-                logger.warn('Default toolchain guessed from python config '
-                            'not found, replacing with default GCCToolchain.')
+                logger.warning("Default toolchain guessed from python config "
+                               "not found, replacing with default GCCToolchain.")
                 # this is ugly, but I'm not sure there's a clean way to copy the
                 # default args
                 self.toolchain = GCCToolchain(
-                    cc='gcc',
-                    cflags='-std=c99 -O3 -fPIC'.split(),
-                    ldflags='-shared'.split(),
+                    cc="gcc",
+                    cflags="-std=c99 -O3 -fPIC".split(),
+                    ldflags="-shared".split(),
                     libraries=[],
                     library_dirs=[],
                     defines=[],
                     undefines=[],
-                    source_suffix='c',
-                    so_ext='.so',
-                    o_ext='.o',
+                    source_suffix="c",
+                    so_ext=".so",
+                    o_ext=".o",
                     include_dirs=[])
 
         if toolchain is None:
             # copy in all differing values
-            diff = {'cc': cc,
-                    'cflags': cflags,
-                    'ldflags': ldflags,
-                    'libraries': libraries,
-                    'include_dirs': include_dirs,
-                    'library_dirs': library_dirs,
-                    'defines': defines}
+            diff = {"cc": cc,
+                    "cflags": cflags,
+                    "ldflags": ldflags,
+                    "libraries": libraries,
+                    "include_dirs": include_dirs,
+                    "library_dirs": library_dirs,
+                    "defines": defines}
             # filter empty and those equal to toolchain defaults
             diff = dict((k, v) for k, v in six.iteritems(diff)
                     if v and (not hasattr(self.toolchain, k) or
@@ -267,7 +267,7 @@ class CCompiler(object):
                      debug_recompile=True):
         """Compile code, build and load shared library."""
         logger.debug(code)
-        c_fname = self._tempname('code.' + self.source_suffix)
+        c_fname = self._tempname("code." + self.source_suffix)
 
         # build object
         _, mod_name, ext_file, recompiled = \
@@ -276,9 +276,9 @@ class CCompiler(object):
                                 debug_recompile, False)
 
         if recompiled:
-            logger.debug('Kernel {0} compiled from source'.format(name))
+            logger.debug("Kernel {0} compiled from source".format(name))
         else:
-            logger.debug('Kernel {0} retrieved from cache'.format(name))
+            logger.debug("Kernel {0} retrieved from cache".format(name))
 
         # and return compiled
         return ctypes.CDLL(ext_file)
@@ -288,10 +288,10 @@ class CPlusPlusCompiler(CCompiler):
     """Subclass of CCompiler to invoke a C++ compiler."""
 
     def __init__(self, toolchain=None,
-                 cc='g++', cflags='-std=c++98 -O3 -fPIC'.split(),
+                 cc="g++", cflags="-std=c++98 -O3 -fPIC".split(),
                  ldflags=[], libraries=[],
                  include_dirs=[], library_dirs=[], defines=[],
-                 source_suffix='cpp'):
+                 source_suffix="cpp"):
 
         super(CPlusPlusCompiler, self).__init__(
             toolchain=toolchain, cc=cc, cflags=cflags, ldflags=ldflags,
@@ -322,8 +322,8 @@ class IDIToCDLL(object):
     def _dtype_to_ctype(self, dtype, pointer=False):
         """Map NumPy dtype to equivalent ctypes type."""
         typename = self.registry.dtype_to_ctype(dtype)
-        typename = {'unsigned': 'uint'}.get(typename, typename)
-        basetype = getattr(ctypes, 'c_' + typename)
+        typename = {"unsigned": "uint"}.get(typename, typename)
+        basetype = getattr(ctypes, "c_" + typename)
         if pointer:
             return ctypes.POINTER(basetype)
         return basetype
@@ -359,7 +359,7 @@ class CompiledCKernel(object):
         """Execute kernel with given args mapped to ctypes equivalents."""
         args_ = []
         for arg, arg_t in zip(args, self._fn.argtypes):
-            if hasattr(arg, 'ctypes'):
+            if hasattr(arg, "ctypes"):
                 if arg.size == 0:
                     # TODO eliminate unused arguments from kernel
                     arg_ = arg_t(0.0)
@@ -406,7 +406,7 @@ class CKernelExecutor(KernelExecutorBase):
 
         dev_code = codegen_result.device_code()
         host_code = codegen_result.host_code()
-        all_code = '\n'.join([dev_code, '', host_code])
+        all_code = "\n".join([dev_code, "", host_code])
 
         if self.kernel.options.write_cl:
             output = all_code
@@ -423,7 +423,7 @@ class CKernelExecutor(KernelExecutorBase):
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.c")
             # update code from editor
-            all_code = '\n'.join([dev_code, '', host_code])
+            all_code = "\n".join([dev_code, "", host_code])
 
         c_kernels = []
         for dp in codegen_result.device_programs:
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index bb9ab6355c2b15ae1435da510567d20643ac4792..df49679a6390ee8ab43041527f10682b1967235d 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -178,7 +178,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
     def map_subscript(self, expr, type_context):
         def base_impl(expr, type_context):
-            return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]
+            return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")]
 
         def make_var(name):
             from loopy import TaggedVariable
@@ -226,7 +226,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             base_access = var("read_imagef")(
                     var(ary.name),
                     var("loopy_sampler"),
-                    var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, 'i')))
+                    var("(%s)" % idx_vec_type)(*self.rec(idx_tuple, "i")))
 
             if ary.dtype.numpy_dtype == np.float32:
                 return base_access.attr("x")
@@ -260,7 +260,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         ary,
                         make_var(access_info.array_name),
                         simplify_using_aff(
-                            self.kernel, self.rec(subscript, 'i')))
+                            self.kernel, self.rec(subscript, "i")))
 
             if access_info.vector_index is not None:
                 return self.codegen_state.ast_builder.add_vector_access(
@@ -295,7 +295,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 return self.make_subscript(
                         arg,
                         var(expr.aggregate.name),
-                        self.rec(offset + expr.index, 'i'))
+                        self.rec(offset + expr.index, "i"))
 
         elif expr.aggregate.name in self.kernel.temporary_variables:
             raise RuntimeError("linear indexing is not supported on temporaries: %s"
@@ -339,13 +339,13 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             else:
                 seen_func("%s_pos_b" % base_func_name)
                 return var("%s_pos_b_%s" % (base_func_name, suffix))(
-                        self.rec(expr.numerator, 'i'),
-                        self.rec(expr.denominator, 'i'))
+                        self.rec(expr.numerator, "i"),
+                        self.rec(expr.denominator, "i"))
         else:
             seen_func(base_func_name)
             return var("%s_%s" % (base_func_name, suffix))(
-                    self.rec(expr.numerator, 'i'),
-                    self.rec(expr.denominator, 'i'))
+                    self.rec(expr.numerator, "i"),
+                    self.rec(expr.denominator, "i"))
 
     def map_floor_div(self, expr, type_context):
         import operator
@@ -684,8 +684,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         if not self.allow_complex:
             return base_impl(expr, type_context)
 
-        n_complex = 'c' == n_dtype.kind
-        d_complex = 'c' == d_dtype.kind
+        n_complex = "c" == n_dtype.kind
+        d_complex = "c" == d_dtype.kind
 
         tgt_dtype = self.infer_type(expr)
 
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
index 7e48e1166a13cfbb7b60f909b071f088034ffda1..d1f993daecc03947d9e6e3e60d2a5145ecbf3786 160000
--- a/loopy/target/c/compyte
+++ b/loopy/target/c/compyte
@@ -1 +1 @@
-Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1
+Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 50fd1026f7bd15ce72915d0d5d5e60f6da4e264c..27422abce85cc6adb329bae9f30e4e36dd9bc06b 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -58,18 +58,18 @@ def _create_vector_types():
     vec.type_to_scalar_and_count = {}
 
     for base_name, base_type, counts in [
-            ('char', np.int8, [1, 2, 3, 4]),
-            ('uchar', np.uint8, [1, 2, 3, 4]),
-            ('short', np.int16, [1, 2, 3, 4]),
-            ('ushort', np.uint16, [1, 2, 3, 4]),
-            ('int', np.int32, [1, 2, 3, 4]),
-            ('uint', np.uint32, [1, 2, 3, 4]),
-            ('long', long_dtype, [1, 2, 3, 4]),
-            ('ulong', ulong_dtype, [1, 2, 3, 4]),
-            ('longlong', np.int64, [1, 2]),
-            ('ulonglong', np.uint64, [1, 2]),
-            ('float', np.float32, [1, 2, 3, 4]),
-            ('double', np.float64, [1, 2]),
+            ("char", np.int8, [1, 2, 3, 4]),
+            ("uchar", np.uint8, [1, 2, 3, 4]),
+            ("short", np.int16, [1, 2, 3, 4]),
+            ("ushort", np.uint16, [1, 2, 3, 4]),
+            ("int", np.int32, [1, 2, 3, 4]),
+            ("uint", np.uint32, [1, 2, 3, 4]),
+            ("long", long_dtype, [1, 2, 3, 4]),
+            ("ulong", ulong_dtype, [1, 2, 3, 4]),
+            ("longlong", np.int64, [1, 2]),
+            ("ulonglong", np.uint64, [1, 2]),
+            ("float", np.float32, [1, 2, 3, 4]),
+            ("double", np.float64, [1, 2]),
             ]:
         for count in counts:
             name = "%s%d" % (base_name, count)
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index c5ccc54f148d704d560d7fe2e61863c215bb2489..a503475d095baf15d644484dd0acac17d7577574 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -234,7 +234,7 @@ class ExecutionWrapperGeneratorBase(object):
                     gen("else:")
                     with Indentation(gen):
                         if not options.no_numpy:
-                            gen("_lpy_offset = getattr(%s, \"offset\", 0)"
+                            gen('_lpy_offset = getattr(%s, "offset", 0)'
                                     % impl_array_name)
                         else:
                             gen("_lpy_offset = %s.offset" % impl_array_name)
@@ -246,7 +246,7 @@ class ExecutionWrapperGeneratorBase(object):
                                     % (arg.name, base_arg.dtype.itemsize))
 
                             gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
-                                    "not divisible by its dtype itemsize\""
+                                    'not divisible by its dtype itemsize"'
                                     % impl_array_name)
                             gen("del _lpy_remdr")
                         else:
@@ -281,7 +281,7 @@ class ExecutionWrapperGeneratorBase(object):
                         with Indentation(gen):
                             gen("raise RuntimeError(\"required stride '%s' for "
                                     "argument '%s' not given or deducible from "
-                                    "passed array\")"
+                                    'passed array")'
                                     % (arg.name, impl_array_name))
 
                         base_arg = kernel.impl_arg_to_arg[impl_array_name]
@@ -292,7 +292,7 @@ class ExecutionWrapperGeneratorBase(object):
                                         base_arg.dtype.dtype.itemsize))
 
                             gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' "
-                                    " is not divisible by its dtype itemsize\""
+                                    ' is not divisible by its dtype itemsize"'
                                     % (stride_impl_axis, impl_array_name))
                             gen("del _lpy_remdr")
                         else:
@@ -324,7 +324,7 @@ class ExecutionWrapperGeneratorBase(object):
             with Indentation(gen):
                 gen("raise TypeError(\"value argument '%s' "
                         "was not given and could not be automatically "
-                        "determined\")" % arg.name)
+                        'determined")' % arg.name)
 
         gen("# }}}")
         gen("")
@@ -409,7 +409,7 @@ class ExecutionWrapperGeneratorBase(object):
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     gen("raise RuntimeError(\"input argument '%s' must "
-                            "be supplied\")" % arg.name)
+                            'be supplied")' % arg.name)
                     gen("")
 
             if (is_written
@@ -418,14 +418,14 @@ class ExecutionWrapperGeneratorBase(object):
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     gen("raise RuntimeError(\"written image '%s' must "
-                            "be supplied\")" % arg.name)
+                            'be supplied")' % arg.name)
                     gen("")
 
             if is_written and arg.shape is None and not options.skip_arg_checks:
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     gen("raise RuntimeError(\"written argument '%s' has "
-                            "unknown shape and must be supplied\")" % arg.name)
+                            'unknown shape and must be supplied")' % arg.name)
                     gen("")
 
             possibly_made_by_loopy = False
@@ -468,7 +468,7 @@ class ExecutionWrapperGeneratorBase(object):
                                 kernel_arg.dtype.numpy_dtype)))
                     with Indentation(gen):
                         gen("raise TypeError(\"dtype mismatch on argument '%s' "
-                                "(got: %%s, expected: %s)\" %% %s.dtype)"
+                                '(got: %%s, expected: %s)" %% %s.dtype)'
                                 % (arg.name, arg.dtype, arg.name))
 
                     # {{{ generate shape checking code
@@ -489,7 +489,7 @@ class ExecutionWrapperGeneratorBase(object):
 
                     shape_mismatch_msg = (
                             "raise TypeError(\"shape mismatch on argument '%s' "
-                            "(got: %%s, expected: %%s)\" "
+                            '(got: %%s, expected: %%s)" '
                             "%% (%s.shape, %s))"
                             % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
 
@@ -545,10 +545,10 @@ class ExecutionWrapperGeneratorBase(object):
                                     "if dim > 1)"
                                     % (arg.name, strify_tuple(sym_strides)))
 
-                            gen("raise TypeError(\"strides mismatch on "
+                            gen('raise TypeError("strides mismatch on '
                                     "argument '%s' "
                                     "(after removing unit length dims, "
-                                    "got: %%s, expected: %%s)\" "
+                                    'got: %%s, expected: %%s)" '
                                     "%% (_lpy_got, _lpy_expected))"
                                     % arg.name)
 
@@ -559,7 +559,7 @@ class ExecutionWrapperGeneratorBase(object):
                             gen("raise ValueError(\"Argument '%s' does not "
                                     "allow arrays with offsets. Try passing "
                                     "default_offset=loopy.auto to make_kernel()."
-                                    "\")" % arg.name)
+                                    '")' % arg.name)
                             gen("")
 
             # }}}
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index eb0157bf86d478901fb5a07bbac28aa7a11bcec9..322c771b653a2fd28977538f81e64a63e8984784 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -117,7 +117,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
 
                 subscript, = access_info.subscripts
                 result = var(access_info.array_name)[
-                        var("programIndex") + self.rec(lsize*subscript, 'i')]
+                        var("programIndex") + self.rec(lsize*subscript, "i")]
 
                 if access_info.vector_index is not None:
                     return self.kernel.target.add_vector_access(
@@ -475,7 +475,7 @@ class ISPCASTBuilder(CFamilyASTBuilder):
                     "streaming_store(%s + %s, %s)"
                     % (
                         access_info.array_name,
-                        ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
+                        ecm(flattened_sum(new_terms), PREC_NONE, "i"),
                         rhs_code))
 
         # }}}
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 4569be50367b3063999656bcd1de9d76f98e8c0a..f81c05a396c30bb9043d55416f157182beb5085b 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -81,16 +81,16 @@ def _create_vector_types():
     counts = [2, 3, 4, 8, 16]
 
     for base_name, base_type in [
-            ('char', np.int8),
-            ('uchar', np.uint8),
-            ('short', np.int16),
-            ('ushort', np.uint16),
-            ('int', np.int32),
-            ('uint', np.uint32),
-            ('long', np.int64),
-            ('ulong', np.uint64),
-            ('float', np.float32),
-            ('double', np.float64),
+            ("char", np.int8),
+            ("uchar", np.uint8),
+            ("short", np.int16),
+            ("ushort", np.uint16),
+            ("int", np.int32),
+            ("uint", np.uint32),
+            ("long", np.int64),
+            ("ulong", np.uint64),
+            ("float", np.float32),
+            ("double", np.float64),
             ]:
         for count in counts:
             name = "%s%d" % (base_name, count)
@@ -151,16 +151,16 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = {
 VECTOR_LITERAL_FUNCS = dict(
         ("make_%s%d" % (name, count), (name, dtype, count))
         for name, dtype in [
-            ('char', np.int8),
-            ('uchar', np.uint8),
-            ('short', np.int16),
-            ('ushort', np.uint16),
-            ('int', np.int32),
-            ('uint', np.uint32),
-            ('long', np.int64),
-            ('ulong', np.uint64),
-            ('float', np.float32),
-            ('double', np.float64),
+            ("char", np.int8),
+            ("uchar", np.uint8),
+            ("short", np.int16),
+            ("ushort", np.uint16),
+            ("int", np.int32),
+            ("uint", np.uint32),
+            ("long", np.int64),
+            ("ulong", np.uint64),
+            ("float", np.float32),
+            ("double", np.float64),
             ]
         for count in [2, 3, 4, 8, 16]
         )
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 24b724c6022dade0eba682539096fda1156e0b5c..7ede6e7605652308be676af9b2a069a7495eaf38 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -57,7 +57,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         if dtype.isbuiltin:
             return "_lpy_np."+dtype.name
         else:
-            return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
+            return ('_lpy_cl_tools.get_or_register_dtype("%s")'
                     % cl_tools.dtype_to_ctype(dtype))
 
     # {{{ handle non-numpy args
@@ -222,7 +222,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
         if options.return_dict:
             gen("return _lpy_evt, {%s}"
-                    % ", ".join("\"%s\": %s" % (arg.name, arg.name)
+                    % ", ".join('"%s": %s' % (arg.name, arg.name)
                         for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
                         if arg.base_name in kernel.get_written_variables()))
diff --git a/loopy/tools.py b/loopy/tools.py
index a1cd5e108a45ba60c71b3bb7a51f779b84172065..a93b918f4cf749db55a656acb8522c7daf9d06af 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -40,12 +40,8 @@ import six  # noqa
 from six.moves import intern
 
 
-if six.PY2:
-    def is_integer(obj):
-        return isinstance(obj, (int, long, np.integer))  # noqa pylint:disable=undefined-variable
-else:
-    def is_integer(obj):
-        return isinstance(obj, (int, np.integer))
+def is_integer(obj):
+    return isinstance(obj, (int, np.integer))
 
 
 # {{{ custom KeyBuilder subclass
@@ -317,8 +313,8 @@ def cptr_from_numpy(obj):
 
 
 # https://github.com/hgomersall/pyFFTW/blob/master/pyfftw/utils.pxi#L172
-def empty_aligned(shape, dtype, order='C', n=64):
-    '''empty_aligned(shape, dtype='float64', order='C', n=None)
+def empty_aligned(shape, dtype, order="C", n=64):
+    """empty_aligned(shape, dtype='float64', order="C", n=None)
     Function that returns an empty numpy array that is n-byte aligned,
     where ``n`` is determined by inspecting the CPU if it is not
     provided.
@@ -326,7 +322,7 @@ def empty_aligned(shape, dtype, order='C', n=64):
     ``n`` is not provided then this function will inspect the CPU to
     determine alignment. The rest of the arguments are as per
     :func:`numpy.empty`.
-    '''
+    """
     itemsize = np.dtype(dtype).itemsize
 
     # Apparently there is an issue with numpy.prod wrapping around on 32-bits
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 6c7cb3365991cf92db4c0fa2a56a07e9ad07f66d..905c1e64ab96b039bce5e451da71ce1f73792e0e 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -723,7 +723,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
     from loopy.match import re_from_glob
     new_iname_to_tag = {}
     for iname, new_tag in iname_to_tag:
-        if '*' in iname or '?' in iname:
+        if "*" in iname or "?" in iname:
             match_re = re_from_glob(iname)
             for sub_iname in all_inames:
                 if match_re.match(sub_iname):
diff --git a/setup.cfg b/setup.cfg
index a0d95746e1a399d6a2d7c315bffc9b834d2f5487..9495d106cf389d485037db16a35a14b4aaf6c873 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,3 +4,7 @@ max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
     loopy/target/c/compyte/array.py
+
+inline-quotes = "
+docstring-quotes = """
+multiline-quotes = """
diff --git a/setup.py b/setup.py
index c041ba2dad331d44ae34ea7959df32de05ec807b..497fa60ba09acc34a24e77bd17bf56a69b9490e7 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@ finally:
     version_file.close()
 
 os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
-exec(compile(version_file_contents, "loopy/version.py", 'exec'), ver_dic)
+exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic)
 
 
 # {{{ capture git revision at install time
@@ -56,7 +56,7 @@ def write_git_revision(package_name):
     git_rev = find_git_revision(dn)
 
     with open(join(dn, package_name, "_git_rev.py"), "w") as outf:
-        outf.write("GIT_REVISION = %s\n" % repr(git_rev))
+        outf.write('GIT_REVISION = "%s"\n' % git_rev)
 
 
 write_git_revision("loopy")
@@ -69,20 +69,20 @@ setup(name="loo.py",
       description="A code generator for array-based code on CPUs and GPUs",
       long_description=open("README.rst", "rt").read(),
       classifiers=[
-          'Development Status :: 4 - Beta',
-          'Intended Audience :: Developers',
-          'Intended Audience :: Other Audience',
-          'Intended Audience :: Science/Research',
-          'License :: OSI Approved :: MIT License',
-          'Natural Language :: English',
-          'Programming Language :: Python',
-          'Programming Language :: Python :: 3',
-          'Topic :: Scientific/Engineering',
-          'Topic :: Scientific/Engineering :: Information Analysis',
-          'Topic :: Scientific/Engineering :: Mathematics',
-          'Topic :: Scientific/Engineering :: Visualization',
-          'Topic :: Software Development :: Libraries',
-          'Topic :: Utilities',
+          "Development Status :: 4 - Beta",
+          "Intended Audience :: Developers",
+          "Intended Audience :: Other Audience",
+          "Intended Audience :: Science/Research",
+          "License :: OSI Approved :: MIT License",
+          "Natural Language :: English",
+          "Programming Language :: Python",
+          "Programming Language :: Python :: 3",
+          "Topic :: Scientific/Engineering",
+          "Topic :: Scientific/Engineering :: Information Analysis",
+          "Topic :: Scientific/Engineering :: Mathematics",
+          "Topic :: Scientific/Engineering :: Visualization",
+          "Topic :: Software Development :: Libraries",
+          "Topic :: Utilities",
           ],
 
       python_requires="~=3.6",
diff --git a/test/test_apps.py b/test/test_apps.py
index f7eeb756e735ffb4d5ab6ab747c6bb792c690668..ed5e5ce3d25bd37c92d400021422b362c3a5e28f 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -47,7 +47,7 @@ from loopy.diagnostic import LoopyError
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -102,10 +102,10 @@ def test_convolution(ctx_factory):
         knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
         knl = lp.tag_inames(knl, dict(ifeat="g.2"))
         knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]",
-                fetch_outer_inames='im_x_outer, im_y_outer, ifeat',
+                fetch_outer_inames="im_x_outer, im_y_outer, ifeat",
                 default_tag="l.auto")
         knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y",
-                fetch_outer_inames='iimg, im_x_outer, im_y_outer, ifeat, icolor',
+                fetch_outer_inames="iimg, im_x_outer, im_y_outer, ifeat, icolor",
                 default_tag="l.auto")
         return knl
 
@@ -592,12 +592,12 @@ def test_poisson_fem(ctx_factory):
     knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])
 
     def variant_1(knl):
-        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
+        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for")
         knl = lp.prioritize_loops(knl, "c,i,j")
         return knl
 
     def variant_2(knl):
-        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
+        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag="for")
         knl = lp.prioritize_loops(knl, "c,i,j")
         return knl
 
@@ -633,10 +633,10 @@ def test_domain_tree_nesting():
 
     TV = lp.TemporaryVariable  # noqa
 
-    knl = lp.make_kernel(['{[i]: 0 <= i < 12}',
-                    '{[j]: 0 <= j < 100}',
-                    '{[a_count]: 0 <= a_count < a_end}',
-                    '{[b_count]: 0 <= b_count < b_end}'],
+    knl = lp.make_kernel(["{[i]: 0 <= i < 12}",
+                    "{[j]: 0 <= j < 100}",
+                    "{[a_count]: 0 <= a_count < a_end}",
+                    "{[b_count]: 0 <= b_count < b_end}"],
     """
     for j
         for i
@@ -655,15 +655,15 @@ def test_domain_tree_nesting():
     end
     """,
     [
-        TV('out_map', initializer=out_map, read_only=True, address_space=AS.PRIVATE),
-        TV('if_val', initializer=if_val, read_only=True, address_space=AS.PRIVATE),
-        TV('vals', initializer=vals, read_only=True, address_space=AS.PRIVATE),
-        TV('num_vals', initializer=num_vals, read_only=True,
+        TV("out_map", initializer=out_map, read_only=True, address_space=AS.PRIVATE),
+        TV("if_val", initializer=if_val, read_only=True, address_space=AS.PRIVATE),
+        TV("vals", initializer=vals, read_only=True, address_space=AS.PRIVATE),
+        TV("num_vals", initializer=num_vals, read_only=True,
            address_space=AS.PRIVATE),
-        TV('num_vals_offset', initializer=num_vals_offset, read_only=True,
+        TV("num_vals_offset", initializer=num_vals_offset, read_only=True,
            address_space=AS.PRIVATE),
-        lp.GlobalArg('B', shape=(100, 31), dtype=np.float64),
-        lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)])
+        lp.GlobalArg("B", shape=(100, 31), dtype=np.float64),
+        lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)])
 
     parents_per_domain = knl.parents_per_domain()
 
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index b0ca7ade25d3077c7f868f366cb9ff6bb011af33..53fb80be784b4b89c2740daa80004d59ae0f2e97 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -63,29 +63,29 @@ def test_c_target():
 def test_c_target_strides():
     from loopy.target.c import ExecutableCTarget
 
-    def __get_kernel(order='C'):
+    def __get_kernel(order="C"):
         return lp.make_kernel(
                 "{ [i,j]: 0<=i,j<n }",
                 "out[i, j] = 2*a[i, j]",
                 [
-                    lp.GlobalArg("out", np.float32, shape=('n', 'n'), order=order),
-                    lp.GlobalArg("a", np.float32, shape=('n', 'n'), order=order),
+                    lp.GlobalArg("out", np.float32, shape=("n", "n"), order=order),
+                    lp.GlobalArg("a", np.float32, shape=("n", "n"), order=order),
                     "..."
                     ],
                 target=ExecutableCTarget())
 
     # test with C-order
-    knl = __get_kernel('C')
+    knl = __get_kernel("C")
     a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1),
-                      order='C')
+                      order="C")
 
     assert np.allclose(knl(a=a_np)[1],
                 2 * a_np)
 
     # test with F-order
-    knl = __get_kernel('F')
+    knl = __get_kernel("F")
     a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1),
-                      order='F')
+                      order="F")
 
     assert np.allclose(knl(a=a_np)[1],
                 2 * a_np)
@@ -94,18 +94,18 @@ def test_c_target_strides():
 def test_c_target_strides_nonsquare():
     from loopy.target.c import ExecutableCTarget
 
-    def __get_kernel(order='C'):
-        indicies = ['i', 'j', 'k']
+    def __get_kernel(order="C"):
+        indicies = ["i", "j", "k"]
         sizes = tuple(np.random.randint(1, 11, size=len(indicies)))
         # create domain strings
-        domain_template = '{{ [{iname}]: 0 <= {iname} < {size} }}'
+        domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}"
         domains = []
         for idx, size in zip(indicies, sizes):
             domains.append(domain_template.format(
                 iname=idx,
                 size=size))
-        statement = 'out[{indexed}] = 2 * a[{indexed}]'.format(
-            indexed=', '.join(indicies))
+        statement = "out[{indexed}] = 2 * a[{indexed}]".format(
+            indexed=", ".join(indicies))
         return lp.make_kernel(
                 domains,
                 statement,
@@ -117,21 +117,21 @@ def test_c_target_strides_nonsquare():
                 target=ExecutableCTarget())
 
     # test with C-order
-    knl = __get_kernel('C')
-    a_lp = next(x for x in knl.args if x.name == 'a')
+    knl = __get_kernel("C")
+    a_lp = next(x for x in knl.args if x.name == "a")
     a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32),
                       a_lp.shape,
-                      order='C')
+                      order="C")
 
     assert np.allclose(knl(a=a_np)[1],
                 2 * a_np)
 
     # test with F-order
-    knl = __get_kernel('F')
-    a_lp = next(x for x in knl.args if x.name == 'a')
+    knl = __get_kernel("F")
+    a_lp = next(x for x in knl.args if x.name == "a")
     a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32),
                       a_lp.shape,
-                      order='F')
+                      order="F")
 
     assert np.allclose(knl(a=a_np)[1],
                 2 * a_np)
@@ -140,18 +140,18 @@ def test_c_target_strides_nonsquare():
 def test_c_optimizations():
     from loopy.target.c import ExecutableCTarget
 
-    def __get_kernel(order='C'):
-        indicies = ['i', 'j', 'k']
+    def __get_kernel(order="C"):
+        indicies = ["i", "j", "k"]
         sizes = tuple(np.random.randint(1, 11, size=len(indicies)))
         # create domain strings
-        domain_template = '{{ [{iname}]: 0 <= {iname} < {size} }}'
+        domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}"
         domains = []
         for idx, size in zip(indicies, sizes):
             domains.append(domain_template.format(
                 iname=idx,
                 size=size))
-        statement = 'out[{indexed}] = 2 * a[{indexed}]'.format(
-            indexed=', '.join(indicies))
+        statement = "out[{indexed}] = 2 * a[{indexed}]".format(
+            indexed=", ".join(indicies))
         return lp.make_kernel(
                 domains,
                 statement,
@@ -163,20 +163,20 @@ def test_c_optimizations():
                 target=ExecutableCTarget()), sizes
 
     # test with ILP
-    knl, sizes = __get_kernel('C')
-    knl = lp.split_iname(knl, 'i', 4, inner_tag='ilp')
+    knl, sizes = __get_kernel("C")
+    knl = lp.split_iname(knl, "i", 4, inner_tag="ilp")
     a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32),
                       sizes,
-                      order='C')
+                      order="C")
 
     assert np.allclose(knl(a=a_np)[1], 2 * a_np)
 
     # test with unrolling
-    knl, sizes = __get_kernel('C')
-    knl = lp.split_iname(knl, 'i', 4, inner_tag='unr')
+    knl, sizes = __get_kernel("C")
+    knl = lp.split_iname(knl, "i", 4, inner_tag="unr")
     a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32),
                       sizes,
-                      order='C')
+                      order="C")
 
     assert np.allclose(knl(a=a_np)[1], 2 * a_np)
 
@@ -186,13 +186,13 @@ def test_function_decl_extractor():
     # in execution
     from loopy.target.c import ExecutableCTarget
 
-    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+    knl = lp.make_kernel("{[i]: 0 <= i < 10}",
         """
             a[i] = b[i] + v
         """,
-        [lp.GlobalArg('a', shape=(10,), dtype=np.int32),
-         lp.ConstantArg('b', shape=(10)),
-         lp.ValueArg('v', dtype=np.int32)],
+        [lp.GlobalArg("a", shape=(10,), dtype=np.int32),
+         lp.ConstantArg("b", shape=(10)),
+         lp.ValueArg("v", dtype=np.int32)],
         target=ExecutableCTarget())
 
     assert np.allclose(knl(b=np.arange(10), v=-1)[1], np.arange(10) - 1)
@@ -245,14 +245,14 @@ def test_c_caching():
             return self.buffer.getvalue()
 
     def __get_knl():
-        return lp.make_kernel('{[i]: 0 <= i < 10}',
+        return lp.make_kernel("{[i]: 0 <= i < 10}",
         """
             a[i] = b[i]
         """,
-        [lp.GlobalArg('a', shape=(10,), dtype=np.int32),
-         lp.ConstantArg('b', shape=(10))],
+        [lp.GlobalArg("a", shape=(10,), dtype=np.int32),
+         lp.ConstantArg("b", shape=(10))],
                              target=ExecutableCTarget(),
-                             name='cache_test')
+                             name="cache_test")
 
     knl = __get_knl()
     # compile
@@ -267,7 +267,7 @@ def test_c_caching():
     # and get logs
     logs = tl.stop_capture()
     # check that we didn't recompile
-    assert 'Kernel cache_test retrieved from cache' in logs
+    assert "Kernel cache_test retrieved from cache" in logs
 
 
 def test_c_execution_with_global_temporaries():
@@ -278,12 +278,12 @@ def test_c_execution_with_global_temporaries():
     AS = lp.AddressSpace        # noqa
     n = 10
 
-    knl = lp.make_kernel('{[i]: 0 <= i < n}',
+    knl = lp.make_kernel("{[i]: 0 <= i < n}",
         """
             a[i] = b[i]
         """,
-        [lp.GlobalArg('a', shape=(n,), dtype=np.int32),
-         lp.TemporaryVariable('b', shape=(n,),
+        [lp.GlobalArg("a", shape=(n,), dtype=np.int32),
+         lp.TemporaryVariable("b", shape=(n,),
                               initializer=np.arange(n, dtype=np.int32),
                               dtype=np.int32,
                               read_only=True,
@@ -291,7 +291,7 @@ def test_c_execution_with_global_temporaries():
         target=ExecutableCTarget())
 
     knl = lp.fix_parameters(knl, n=n)
-    assert ('int b[%d]' % n) not in lp.generate_code_v2(knl).host_code()
+    assert ("int b[%d]" % n) not in lp.generate_code_v2(knl).host_code()
     assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))
 
 
@@ -303,12 +303,12 @@ def test_missing_compilers():
     def __test(evalfunc, target, **targetargs):
         n = 10
 
-        knl = lp.make_kernel('{[i]: 0 <= i < n}',
+        knl = lp.make_kernel("{[i]: 0 <= i < n}",
             """
                 a[i] = b[i]
             """,
-            [lp.GlobalArg('a', shape=(n,), dtype=np.int32),
-             lp.GlobalArg('b', shape=(n,), dtype=np.int32)],
+            [lp.GlobalArg("a", shape=(n,), dtype=np.int32),
+             lp.GlobalArg("b", shape=(n,), dtype=np.int32)],
             target=target(**targetargs))
 
         knl = lp.fix_parameters(knl, n=n)
@@ -327,7 +327,7 @@ def test_missing_compilers():
     try:
         # test with path wiped out such that we can't find gcc
         with pytest.raises(ExecError):
-            os.environ["PATH"] = ''
+            os.environ["PATH"] = ""
             ccomp = CCompiler()
             __test(eval_tester, ExecutableCTarget, compiler=ccomp)
     finally:
@@ -343,9 +343,9 @@ def test_missing_compilers():
         __test(eval_tester, ExecutableCTarget, compiler=ccomp)
 
     # next test that some made up compiler can be specified
-    ccomp = CCompiler(cc='foo')
+    ccomp = CCompiler(cc="foo")
     assert isinstance(ccomp.toolchain, GCCToolchain)
-    assert ccomp.toolchain.cc == 'foo'
+    assert ccomp.toolchain.cc == "foo"
 
     # and that said made up compiler errors out
 
diff --git a/test/test_dg.py b/test/test_dg.py
index 543701a5fb4f2ce8c40851117573d1f72639436c..94760fd6dded8a3b13bdf81488d4006c49aeb817 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -100,7 +100,7 @@ def test_dg_volume(ctx_factory):
         knl = lp.tag_inames(knl, dict(n="l.0"))
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
         knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
-                fetch_outer_inames='k_outer',
+                fetch_outer_inames="k_outer",
                 default_tag="l.auto")
         return knl
 
diff --git a/test/test_domain.py b/test/test_domain.py
index 8962514450f8ee352089104b2ffc1241e323725d..7408b338b9c95546cbeac4d2c80986e1333f17b6 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -330,7 +330,7 @@ def test_equality_constraints(ctx_factory):
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
 
-    knl = lp.add_inames_to_insn(knl, 'j_inner, j_outer', 'id:set_b')
+    knl = lp.add_inames_to_insn(knl, "j_inner, j_outer", "id:set_b")
 
     #print(knl)
     #print(knl.domains[0].detect_equalities())
diff --git a/test/test_expression.py b/test/test_expression.py
index 41a8de656efcfc44fe404fa4722572d36c974409..50f143ff6fc294b93ccead1fe5d9391cca46bbb8 100644
--- a/test/test_expression.py
+++ b/test/test_expression.py
@@ -51,7 +51,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -381,8 +381,8 @@ def test_sci_notation_literal(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     set_kernel = lp.make_kernel(
-         ''' { [i]: 0<=i<12 } ''',
-         ''' out[i] = 1e-12''')
+         """ { [i]: 0<=i<12 } """,
+         """ out[i] = 1e-12""")
 
     set_kernel = lp.set_options(set_kernel, write_cl=True)
 
@@ -396,8 +396,8 @@ def test_indexof(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<5 } ''',
-         ''' out[i,j] = indexof(out[i,j])''')
+         """ { [i,j]: 0<=i,j<5 } """,
+         """ out[i,j] = indexof(out[i,j])""")
 
     knl = lp.set_options(knl, write_cl=True)
 
@@ -420,8 +420,8 @@ def test_indexof_vec(ctx_factory):
         pytest.skip("target ICD miscompiles vector code")
 
     knl = lp.make_kernel(
-         ''' { [i,j,k]: 0<=i,j,k<4 } ''',
-         ''' out[i,j,k] = indexof_vec(out[i,j,k])''')
+         """ { [i,j,k]: 0<=i,j,k<4 } """,
+         """ out[i,j,k] = indexof_vec(out[i,j,k])""")
 
     knl = lp.tag_inames(knl, {"i": "vec"})
     knl = lp.tag_data_axes(knl, "out", "vec,c,c")
@@ -479,7 +479,7 @@ def test_divide_precedence(ctx_factory):
             x[0] = c*(a/b)
             y[0] = c*(a%b)
             """,
-            [lp.ValueArg('a, b, c', np.int32), lp.GlobalArg('x, y', np.int32)])
+            [lp.ValueArg("a, b, c", np.int32), lp.GlobalArg("x, y", np.int32)])
     print(lp.generate_code_v2(knl).device_code())
 
     evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5)
diff --git a/test/test_fortran.py b/test/test_fortran.py
index b9db35c4715d08faae633f6505e1b32dbe110f13..56e88733ab2caa6273bebf5d86be356cc59e6836 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -38,7 +38,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -317,10 +317,10 @@ def test_matmul(ctx_factory, buffer_inames):
     knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
     knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
     knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
-            precompute_outer_inames='i_outer, j_outer, k_outer',
+            precompute_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
     knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
-            precompute_outer_inames='i_outer, j_outer, k_outer',
+            precompute_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
 
     knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 390c5654fc0ee5bae631d26e5a0f58e939f8c78b..002a705861f2a38eb35cf84a56e836bdfcebdb21 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -148,7 +148,7 @@ def test_transpose(ctx_factory):
             outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16,
             outer_tag="g.1", inner_tag="l.0")
-    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"],
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
             default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -316,9 +316,9 @@ def test_rank_one(ctx_factory):
                 outer_tag="g.1", inner_tag="l.1")
 
         knl = lp.add_prefetch(knl, "a",
-                fetch_outer_inames='i_outer, i_inner, j_outer, j_inner')
+                fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
         knl = lp.add_prefetch(knl, "b",
-                fetch_outer_inames='i_outer, i_inner, j_outer, j_inner')
+                fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
         return knl
 
     def variant_3(knl):
@@ -328,11 +328,11 @@ def test_rank_one(ctx_factory):
                 outer_tag="g.1", inner_tag="l.1")
 
         knl = lp.add_prefetch(knl, "a", ["i_inner"],
-                    fetch_outer_inames='i_outer, j_outer, j_inner',
+                    fetch_outer_inames="i_outer, j_outer, j_inner",
                     temporary_address_space=lp.AddressSpace.LOCAL,
                     default_tag="l.auto")
         knl = lp.add_prefetch(knl, "b", ["j_inner"],
-                    fetch_outer_inames='i_outer, j_outer, j_inner',
+                    fetch_outer_inames="i_outer, j_outer, j_inner",
                     temporary_address_space=lp.AddressSpace.LOCAL,
                     default_tag="l.auto")
 
@@ -345,9 +345,9 @@ def test_rank_one(ctx_factory):
                 outer_tag="g.1", slabs=(0, 1))
 
         knl = lp.add_prefetch(knl, "a", ["i_inner"],
-                fetch_outer_inames='i_outer, j_outer', default_tag=None)
+                fetch_outer_inames="i_outer, j_outer", default_tag=None)
         knl = lp.add_prefetch(knl, "b", ["j_inner"],
-                fetch_outer_inames='i_outer, j_outer', default_tag=None)
+                fetch_outer_inames="i_outer, j_outer", default_tag=None)
 
         knl = lp.split_iname(knl, "i_inner", 16,
                 inner_tag="l.0")
@@ -403,8 +403,8 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
     knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
     knl = lp.split_iname(knl, "k", 16)
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -444,17 +444,17 @@ def test_intel_matrix_mul(ctx_factory):
     knl = lp.split_iname(knl, "k", 16)
     #knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr")
 
-    knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "a", ["i_inner_inner", "k_inner", "i_inner_outer"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
-    knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "b", ["j_inner_inner", "k_inner", "j_inner_outer"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
 
     # FIXME: Grouped prefetch
-    #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")],
+    #knl = lp.add_prefetch(knl, "a", ["k_inner", ("i_inner_inner", "i_inner_outer")],
     #           default_tag="l.auto")
-    #knl = lp.add_prefetch(knl, 'b',
+    #knl = lp.add_prefetch(knl, "b",
     # ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto")
 
     #hints=["k_outer", "k_inner_outer", "k_inner_inner"]
@@ -506,9 +506,9 @@ def test_magma_fermi_matrix_mul(ctx_factory):
     knl = lp.split_iname(knl, "k", 16)
     knl = lp.split_iname(knl, "k_inner", 8, outer_tag="unr")
     # FIXME
-    #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"],
+    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"],
     #           default_tag="l.auto")
-    #knl = lp.add_prefetch(knl, 'b',
+    #knl = lp.add_prefetch(knl, "b",
     #    ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -550,11 +550,11 @@ def test_image_matrix_mul(ctx_factory):
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 32)
     # conflict-free
-    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
-    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -600,8 +600,8 @@ def no_test_image_matrix_mul_ilp(ctx_factory):
             outer_tag="ilp", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 2)
     # conflict-free?
-    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto")
-    knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"],
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"], default_tag="l.auto")
+    knl = lp.add_prefetch(knl, "b", ["j_inner_outer", "j_inner_inner", "k_inner"],
             default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -634,11 +634,11 @@ def test_fancy_matrix_mul(ctx_factory):
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
-    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
-    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"],
-            fetch_outer_inames='i_outer, j_outer, k_outer',
+    knl = lp.add_prefetch(knl, "b", ["k_inner", "j_inner"],
+            fetch_outer_inames="i_outer, j_outer, k_outer",
             default_tag="l.auto")
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
@@ -670,7 +670,7 @@ def test_small_batched_matvec(ctx_factory):
     seq_knl = knl
 
     align_bytes = 64
-    knl = lp.add_prefetch(knl, 'd[:,:]', default_tag="l.auto")
+    knl = lp.add_prefetch(knl, "d[:,:]", default_tag="l.auto")
     pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
     knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
     knl = lp.add_padding(knl, "f", 0, align_bytes)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 0875c121c0f02a51754305d10a0c5388b36c3262..de400482380e21bdff5d63782894d92133e7498c 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -48,7 +48,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -68,9 +68,9 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory):
             out[ii] = 2*out[ii]+cnst[ii]{id=second}
             """,
             [lp.TemporaryVariable(
-                'cnst', initializer=cnst,
+                "cnst", initializer=cnst,
                 scope=lp.AddressSpace.GLOBAL,
-                read_only=True), '...'])
+                read_only=True), "..."])
     knl = lp.fix_parameters(knl, n=16)
     knl = lp.add_barrier(knl, "id:first", "id:second")
 
@@ -849,8 +849,8 @@ def test_auto_test_zero_warmup_rounds(ctx_factory):
 
 def test_variable_size_temporary():
     knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<n } ''',
-         ''' out[i] = sum(j, a[i,j])''')
+         """{ [i,j]: 0<=i,j<n }""",
+         """out[i] = sum(j, a[i,j])""")
 
     knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
 
@@ -936,7 +936,7 @@ def test_atomic_load(ctx_factory, dtype):
                 lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
                 lp.GlobalArg("a", dtype, shape=lp.auto),
                 lp.GlobalArg("b", dtype, shape=lp.auto),
-                lp.TemporaryVariable('temp', dtype, for_atomic=True,
+                lp.TemporaryVariable("temp", dtype, for_atomic=True,
                                      address_space=AddressSpace.LOCAL),
                 "..."
                 ],
@@ -961,7 +961,7 @@ def test_atomic_init(dtype):
                 "..."
                 ],
             silenced_warnings=["write_race(init)"])
-    knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0')
+    knl = lp.split_iname(knl, "i", vec_width, inner_tag="l.0")
     print(knl)
     print(lp.generate_code_v2(knl).device_code())
 
@@ -997,7 +997,7 @@ def test_within_inames_and_reduction():
 
     k = lp.preprocess_kernel(k)
 
-    assert 'i' not in k.insn_inames("insn_0_j_update")
+    assert "i" not in k.insn_inames("insn_0_j_update")
     print(k.stringify(with_dependencies=True))
 
 
@@ -1021,9 +1021,9 @@ def test_literal_local_barrier(ctx_factory):
 
 def test_local_barrier_mem_kind():
     def _test_type(mtype, expected):
-        insn = '... lbarrier'
+        insn = "... lbarrier"
         if mtype:
-            insn += '{mem_kind=%s}' % mtype
+            insn += "{mem_kind=%s}" % mtype
         knl = lp.make_kernel(
                 "{ [i]: 0<=i<n }",
                 """
@@ -1034,11 +1034,11 @@ def test_local_barrier_mem_kind():
                 target=lp.PyOpenCLTarget())
 
         cgr = lp.generate_code_v2(knl)
-        assert 'barrier(%s)' % expected in cgr.device_code()
+        assert "barrier(%s)" % expected in cgr.device_code()
 
-    _test_type('', 'CLK_LOCAL_MEM_FENCE')
-    _test_type('global', 'CLK_GLOBAL_MEM_FENCE')
-    _test_type('local', 'CLK_LOCAL_MEM_FENCE')
+    _test_type("", "CLK_LOCAL_MEM_FENCE")
+    _test_type("global", "CLK_GLOBAL_MEM_FENCE")
+    _test_type("local", "CLK_LOCAL_MEM_FENCE")
 
 
 def test_kernel_splitting(ctx_factory):
@@ -1774,7 +1774,7 @@ def test_index_cse(ctx_factory):
 def test_ilp_and_conditionals(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel('{[k]: 0<=k<n}}',
+    knl = lp.make_kernel("{[k]: 0<=k<n}}",
          """
          for k
              <> Tcond = T[k] < 0.5
@@ -1789,7 +1789,7 @@ def test_ilp_and_conditionals(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp')
+    knl = lp.split_iname(knl, "k", 2, inner_tag="ilp")
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
@@ -1797,7 +1797,7 @@ def test_ilp_and_conditionals(ctx_factory):
 def test_unr_and_conditionals(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel('{[k]: 0<=k<n}}',
+    knl = lp.make_kernel("{[k]: 0<=k<n}}",
          """
          for k
              <> Tcond[k] = T[k] < 0.5
@@ -1812,7 +1812,7 @@ def test_unr_and_conditionals(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.split_iname(knl, 'k', 2, inner_tag='unr')
+    knl = lp.split_iname(knl, "k", 2, inner_tag="unr")
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
@@ -1820,7 +1820,7 @@ def test_unr_and_conditionals(ctx_factory):
 def test_constant_array_args(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel('{[k]: 0<=k<n}}',
+    knl = lp.make_kernel("{[k]: 0<=k<n}}",
          """
          for k
              <> Tcond[k] = T[k] < 0.5
@@ -1829,8 +1829,8 @@ def test_constant_array_args(ctx_factory):
              end
          end
          """,
-         [lp.ConstantArg('T', shape=(200,), dtype=np.float32),
-         '...'])
+         [lp.ConstantArg("T", shape=(200,), dtype=np.float32),
+         "..."])
 
     knl = lp.fix_parameters(knl, n=200)
 
@@ -1892,33 +1892,33 @@ def test_const_temp_with_initializer_not_saved():
 
 
 def test_header_extract():
-    knl = lp.make_kernel('{[k]: 0<=k<n}}',
+    knl = lp.make_kernel("{[k]: 0<=k<n}}",
          """
          for k
              T[k] = k**2
          end
          """,
-         [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
-         '...'])
+         [lp.GlobalArg("T", shape=(200,), dtype=np.float32),
+         "..."])
 
     knl = lp.fix_parameters(knl, n=200)
 
     #test C
     cknl = knl.copy(target=lp.CTarget())
     assert str(lp.generate_header(cknl)[0]) == (
-            'void loopy_kernel(float *__restrict__ T);')
+            "void loopy_kernel(float *__restrict__ T);")
 
     #test CUDA
     cuknl = knl.copy(target=lp.CudaTarget())
     assert str(lp.generate_header(cuknl)[0]) == (
             'extern "C" __global__ void __launch_bounds__(1) '
-            'loopy_kernel(float *__restrict__ T);')
+            "loopy_kernel(float *__restrict__ T);")
 
     #test OpenCL
     oclknl = knl.copy(target=lp.PyOpenCLTarget())
     assert str(lp.generate_header(oclknl)[0]) == (
-            '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
-            'loopy_kernel(__global float *__restrict__ T);')
+            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) "
+            "loopy_kernel(__global float *__restrict__ T);")
 
 
 def test_scalars_with_base_storage(ctx_factory):
@@ -2108,37 +2108,37 @@ def test_integer_reduction(ctx_factory):
     n = 200
     for vtype in [np.int32, np.int64]:
         var_int = np.random.randint(1000, size=n).astype(vtype)
-        var_lp = lp.TemporaryVariable('var', initializer=var_int,
+        var_lp = lp.TemporaryVariable("var", initializer=var_int,
                                    read_only=True,
                                    address_space=lp.AddressSpace.PRIVATE,
                                    dtype=to_loopy_type(vtype),
                                    shape=lp.auto)
 
         from collections import namedtuple
-        ReductionTest = namedtuple('ReductionTest', 'kind, check, args')
+        ReductionTest = namedtuple("ReductionTest", "kind, check, args")
 
         reductions = [
-            ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'),
-            ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'),
-            ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'),
-            ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'),
-            ReductionTest('argmax',
+            ReductionTest("max", lambda x: x == np.max(var_int), args="var[k]"),
+            ReductionTest("min", lambda x: x == np.min(var_int), args="var[k]"),
+            ReductionTest("sum", lambda x: x == np.sum(var_int), args="var[k]"),
+            ReductionTest("product", lambda x: x == np.prod(var_int), args="var[k]"),
+            ReductionTest("argmax",
                 lambda x: (
                     x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)),
-                args='var[k], k'),
-            ReductionTest('argmin',
+                args="var[k], k"),
+            ReductionTest("argmin",
                 lambda x: (
                     x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)),
-                args='var[k], k')
+                args="var[k], k")
         ]
 
         for reduction, function, args in reductions:
-            kstr = ("out" if 'arg' not in reduction
+            kstr = ("out" if "arg" not in reduction
                         else "out[0], out[1]")
-            kstr += ' = {0}(k, {1})'.format(reduction, args)
-            knl = lp.make_kernel('{[k]: 0<=k<n}',
+            kstr += " = {0}(k, {1})".format(reduction, args)
+            knl = lp.make_kernel("{[k]: 0<=k<n}",
                                 kstr,
-                                [var_lp, '...'])
+                                [var_lp, "..."])
 
             knl = lp.fix_parameters(knl, n=200)
 
@@ -2292,7 +2292,7 @@ def test_barrier_insertion_near_bottom_of_loop():
 
 def test_barrier_in_overridden_get_grid_size_expanded_kernel():
     # make simple barrier'd kernel
-    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+    knl = lp.make_kernel("{[i]: 0 <= i < 10}",
                    """
               for i
                     a[i] = i {id=a}
@@ -2300,14 +2300,14 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel():
                     b[i + 1] = a[i] {nosync=a}
               end
                    """,
-                   [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C',
+                   [lp.TemporaryVariable("a", np.float32, shape=(10,), order="C",
                                          address_space=lp.AddressSpace.LOCAL),
-                    lp.GlobalArg("b", np.float32, shape=(11,), order='C')],
+                    lp.GlobalArg("b", np.float32, shape=(11,), order="C")],
                seq_dependencies=True)
 
     # split into kernel w/ vesize larger than iname domain
     vecsize = 16
-    knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
+    knl = lp.split_iname(knl, "i", vecsize, inner_tag="l.0")
 
     from testlib import GridOverride
 
@@ -2409,7 +2409,7 @@ def test_struct_assignment(ctx_factory):
 
     bbhit, bbhit_c_decl = cl.tools.match_dtype_to_c_struct(
             ctx.devices[0], "bbhit", bbhit)
-    bbhit = cl.tools.get_or_register_dtype('bbhit', bbhit)
+    bbhit = cl.tools.get_or_register_dtype("bbhit", bbhit)
 
     preamble = bbhit_c_decl
 
@@ -2543,19 +2543,19 @@ def test_preamble_with_separate_temporaries(ctx_factory):
     data = np.random.rand(np.product(num_data))
 
     # make kernel
-    kernel = lp.make_kernel('{[i]: 0 <= i < n}',
+    kernel = lp.make_kernel("{[i]: 0 <= i < n}",
     """
     for i
         <>ind = indirect(offsets[i], offsets[i + 1], 1)
         out[i] = data[ind]
     end
     """,
-    [lp.GlobalArg('out', shape=('n',)),
+    [lp.GlobalArg("out", shape=("n",)),
      lp.TemporaryVariable(
-        'offsets', shape=(offsets.size,), initializer=offsets,
+        "offsets", shape=(offsets.size,), initializer=offsets,
         address_space=lp.AddressSpace.GLOBAL,
         read_only=True),
-     lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)],
+     lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)],
     )
 
     # fixt params, and add manglers / preamble
@@ -2564,13 +2564,13 @@ def test_preamble_with_separate_temporaries(ctx_factory):
             SeparateTemporariesPreambleTestPreambleGenerator,
             )
     func_info = dict(
-            func_name='indirect',
+            func_name="indirect",
             func_arg_dtypes=(np.int32, np.int32, np.int32),
             func_result_dtypes=(np.int32,),
             arr=lookup
             )
 
-    kernel = lp.fix_parameters(kernel, **{'n': n})
+    kernel = lp.fix_parameters(kernel, **{"n": n})
     kernel = lp.register_preamble_generators(
             kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)])
     kernel = lp.register_function_manglers(
@@ -2582,7 +2582,7 @@ def test_preamble_with_separate_temporaries(ctx_factory):
     queue = cl.CommandQueue(ctx)
     # check that it actually performs the lookup correctly
     assert np.allclose(kernel(
-        queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1])
+        queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1])
 
 
 def test_arg_inference_for_predicates():
@@ -2715,7 +2715,7 @@ def test_dep_cycle_printing_and_error():
     # https://gitlab.tiker.net/inducer/loopy/issues/140
     # This kernel has two dep cycles.
 
-    knl = lp.make_kernel('{[i,j,k]: 0 <= i,j,k < 12}',
+    knl = lp.make_kernel("{[i,j,k]: 0 <= i,j,k < 12}",
     """
         for j
             for i
@@ -2735,11 +2735,11 @@ def test_dep_cycle_printing_and_error():
             end
         end
     """,
-    [lp.GlobalArg('a', shape=(12, 12), dtype=np.int32)])
+    [lp.GlobalArg("a", shape=(12, 12), dtype=np.int32)])
 
-    knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
-    knl = lp.split_array_axis(knl, 'a', 1, 4)
-    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+    knl = lp.split_iname(knl, "j", 4, inner_tag="vec")
+    knl = lp.split_array_axis(knl, "a", 1, 4)
+    knl = lp.tag_array_axes(knl, "a", "N1,N0,vec")
     knl = lp.preprocess_kernel(knl)
 
     from loopy.diagnostic import DependencyCycleFound
@@ -2758,7 +2758,7 @@ def test_backwards_dep_printing_and_error():
             d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
             a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
             """, [
-                lp.GlobalArg('a, b', dtype=np.float64),
+                lp.GlobalArg("a, b", dtype=np.float64),
                 "..."
             ])
 
@@ -2837,9 +2837,9 @@ def test_shape_mismatch_check(ctx_factory):
 def test_array_arg_extra_kwargs_persis_hash():
     from loopy.tools import LoopyKeyBuilder
 
-    a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64,
+    a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64,
             address_space=lp.AddressSpace.LOCAL)
-    not_a = lp.ArrayArg('a', shape=(10, ), dtype=np.float64,
+    not_a = lp.ArrayArg("a", shape=(10, ), dtype=np.float64,
             address_space=lp.AddressSpace.PRIVATE)
 
     key_builder = LoopyKeyBuilder()
@@ -2852,7 +2852,7 @@ def test_non_integral_array_idx_raises():
             """
             out[j] = 0 {id=init}
             out[i] = a[1.94**i-1] {dep=init}
-            """, [lp.GlobalArg('a', np.float64), '...'])
+            """, [lp.GlobalArg("a", np.float64), "..."])
 
     from loopy.diagnostic import LoopyError
     with pytest.raises(LoopyError):
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 6016c2f1c9955d3bd58d52ad33a3fa95ed63cff8..f7f88889b63f5811eb30a9d2b591e522d048ff01 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -78,7 +78,7 @@ def test_nbody(ctx_factory):
         knl = lp.split_iname(knl, "j", 256)
         knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
                 ["x_fetch_j", "x_fetch_k"],
-                fetch_outer_inames='i_outer, j_outer', default_tag=None)
+                fetch_outer_inames="i_outer, j_outer", default_tag=None)
         knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
         knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index fff2b5356e75f414356ea1c61c2dd54753186d26..1291de2b0e3e5f62676dcea30cce220a966d98ab 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -82,7 +82,7 @@ def test_tim2d(ctx_factory):
     def variant_orig(knl):
         knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))
 
-        knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e',
+        knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames="e",
                 default_tag="l.auto")
         knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto")
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 41a88b3864166b81d60ec0468cf9e5fbd07c227c..33565ef0007dff2b1ebf671dc0a55341d09c5053 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -66,13 +66,13 @@ def test_op_counter_basic():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
-    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
+    params = {"n": n, "m": m, "ell": ell}
+    f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP)].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), "mul", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32add == f32mul == f32div == n*m*ell*n_subgroups
@@ -98,14 +98,14 @@ def test_op_counter_reduction():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
+    params = {"n": n, "m": m, "ell": ell}
+    f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), "mul", CG.SUBGROUP)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32add == f32mul == n*m*ell*n_subgroups
 
-    op_map_dtype = op_map.group_by('dtype')
+    op_map_dtype = op_map.group_by("dtype")
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
     assert f32 == f32add + f32mul
 
@@ -133,12 +133,12 @@ def test_op_counter_logic():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
+    params = {"n": n, "m": m, "ell": ell}
+    f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, "add", CG.SUBGROUP)].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), "div", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32mul == n*m*n_subgroups
@@ -171,18 +171,18 @@ def test_op_counter_specialops():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP)
+    params = {"n": n, "m": m, "ell": ell}
+    f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, "div", CG.SUBGROUP)].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, "add", CG.SUBGROUP)].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, "pow", CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), "add", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), "func:rsqrt", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), "func:sin", CG.SUBGROUP)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32div == 2*n*m*ell*n_subgroups
@@ -217,16 +217,16 @@ def test_op_counter_bitwise():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params)
-    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP)
+    params = {"n": n, "m": m, "ell": ell}
+    i32add = op_map[lp.Op(np.int32, "add", CG.SUBGROUP)].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, "bw", CG.SUBGROUP)].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), "bw", CG.SUBGROUP)
                    ].eval_with_dict(params)
-    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), "mul", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP)
+    i64add = op_map[lp.Op(np.dtype(np.int64), "add", CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), "shift", CG.SUBGROUP)
                       ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert i32add == n*m*ell*n_subgroups
@@ -261,7 +261,7 @@ def test_op_counter_triangular_domain():
                     knl,
                     subgroup_size=SGS,
                     count_redundant_work=True
-                    )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
+                    )[lp.Op(np.float64, "mul", CG.SUBGROUP)]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -297,31 +297,31 @@ def test_mem_access_counter_basic():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
     n_subgroups = n_workgroups*subgroups_per_group
 
-    f32l = mem_map[lp.MemAccess('global', np.float32,
+    f32l = mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='a',
+                        direction="load", variable="a",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    f32l += mem_map[lp.MemAccess('global', np.float32,
+    f32l += mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='b',
+                        direction="load", variable="b",
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64l = mem_map[lp.MemAccess('global', np.float64,
+    f64l = mem_map[lp.MemAccess("global", np.float64,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='g',
+                        direction="load", variable="g",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    f64l += mem_map[lp.MemAccess('global', np.float64,
+    f64l += mem_map[lp.MemAccess("global", np.float64,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='h',
+                        direction="load", variable="h",
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
@@ -329,14 +329,14 @@ def test_mem_access_counter_basic():
     assert f32l == (3*n*m*ell)*n_subgroups
     assert f64l == (2*n*m)*n_subgroups
 
-    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='c',
+                        direction="store", variable="c",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+    f64s = mem_map[lp.MemAccess("global", np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='e',
+                        direction="store", variable="e",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -361,39 +361,39 @@ def test_mem_access_counter_reduction():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
     n_subgroups = n_workgroups*subgroups_per_group
 
-    f32l = mem_map[lp.MemAccess('global', np.float32,
+    f32l = mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='a',
+                        direction="load", variable="a",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    f32l += mem_map[lp.MemAccess('global', np.float32,
+    f32l += mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='b',
+                        direction="load", variable="b",
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
     assert f32l == (2*n*m*ell)*n_subgroups
 
-    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = mem_map[lp.MemAccess("global", np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='c',
+                        direction="store", variable="c",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
     assert f32s == (n*ell)*n_subgroups
 
-    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+    ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"]
                                  ).to_bytes().eval_and_sum(params)
-    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+    st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"]
                                  ).to_bytes().eval_and_sum(params)
     assert ld_bytes == 4*f32l
     assert st_bytes == 4*f32s
@@ -419,23 +419,23 @@ def test_mem_access_counter_logic():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
     n_subgroups = n_workgroups*subgroups_per_group
 
-    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+    reduced_map = mem_map.group_by("mtype", "dtype", "direction")
 
-    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
-                                       direction='load')
+    f32_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float32),
+                                       direction="load")
                           ].eval_with_dict(params)
-    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
-                                       direction='load')
+    f64_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64),
+                                       direction="load")
                           ].eval_with_dict(params)
-    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
-                                       direction='store')
+    f64_g_s = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64),
+                                       direction="store")
                           ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -464,31 +464,31 @@ def test_mem_access_counter_specialops():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
     n_subgroups = n_workgroups*subgroups_per_group
 
-    f32 = mem_map[lp.MemAccess('global', np.float32,
+    f32 = mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='a',
+                        direction="load", variable="a",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
-    f32 += mem_map[lp.MemAccess('global', np.float32,
+    f32 += mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='b',
+                        direction="load", variable="b",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+    f64 = mem_map[lp.MemAccess("global", np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='g',
+                        direction="load", variable="g",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
-    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
+    f64 += mem_map[lp.MemAccess("global", np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='h',
+                        direction="load", variable="h",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -496,14 +496,14 @@ def test_mem_access_counter_specialops():
     assert f32 == (2*n*m*ell)*n_subgroups
     assert f64 == (2*n*m)*n_subgroups
 
-    f32 = mem_map[lp.MemAccess('global', np.float32,
+    f32 = mem_map[lp.MemAccess("global", np.float32,
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='c',
+                        direction="store", variable="c",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
-    f64 = mem_map[lp.MemAccess('global', np.float64,
+    f64 = mem_map[lp.MemAccess("global", np.float64,
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='e',
+                        direction="store", variable="e",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
@@ -511,7 +511,7 @@ def test_mem_access_counter_specialops():
     assert f32 == (n*m*ell)*n_subgroups
     assert f64 == (n*m)*n_subgroups
 
-    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
+    filtered_map = mem_map.filter_by(direction=["load"], variable=["a", "g"],
                          count_granularity=CG.SUBGROUP)
     tot = filtered_map.eval_and_sum(params)
 
@@ -541,45 +541,45 @@ def test_mem_access_counter_bitwise():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
     n_subgroups = n_workgroups*subgroups_per_group
 
-    i32 = mem_map[lp.MemAccess('global', np.int32,
+    i32 = mem_map[lp.MemAccess("global", np.int32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='a',
+                        direction="load", variable="a",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32,
+    i32 += mem_map[lp.MemAccess("global", np.int32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='b',
+                        direction="load", variable="b",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32,
+    i32 += mem_map[lp.MemAccess("global", np.int32,
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='g',
+                        direction="load", variable="g",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
+    i32 += mem_map[lp.MemAccess("global", np.dtype(np.int32),
                         lid_strides={}, gid_strides={},
-                        direction='load', variable='h',
+                        direction="load", variable="h",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
     assert i32 == (4*n*m+2*n*m*ell)*n_subgroups
 
-    i32 = mem_map[lp.MemAccess('global', np.int32,
+    i32 = mem_map[lp.MemAccess("global", np.int32,
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='c',
+                        direction="store", variable="c",
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32,
+    i32 += mem_map[lp.MemAccess("global", np.int32,
                         lid_strides={}, gid_strides={},
-                        direction='store', variable='e',
+                        direction="store", variable="e",
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -610,7 +610,7 @@ def test_mem_access_counter_mixed():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = div_ceil(ell, group_size_0)
     group_size = group_size_0
@@ -619,33 +619,33 @@ def test_mem_access_counter_mixed():
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
-    f64uniform = mem_map[lp.MemAccess('global', np.float64,
+    f64uniform = mem_map[lp.MemAccess("global", np.float64,
                                 lid_strides={}, gid_strides={},
-                                direction='load', variable='g',
+                                direction="load", variable="g",
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
-    f64uniform += mem_map[lp.MemAccess('global', np.float64,
+    f64uniform += mem_map[lp.MemAccess("global", np.float64,
                                 lid_strides={}, gid_strides={},
-                                direction='load', variable='h',
+                                direction="load", variable="h",
                                 count_granularity=CG.SUBGROUP)
                           ].eval_with_dict(params)
-    f32uniform = mem_map[lp.MemAccess('global', np.float32,
+    f32uniform = mem_map[lp.MemAccess("global", np.float32,
                                 lid_strides={}, gid_strides={},
-                                direction='load', variable='x',
+                                direction="load", variable="x",
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*group_size_0},
-                                direction='load',
-                                variable='a',
+    f32nonconsec = mem_map[lp.MemAccess("global", np.dtype(np.float32),
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*group_size_0},
+                                direction="load",
+                                variable="a",
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
-    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*group_size_0},
-                                direction='load',
-                                variable='b',
+    f32nonconsec += mem_map[lp.MemAccess("global", np.dtype(np.float32),
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*group_size_0},
+                                direction="load",
+                                variable="b",
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
 
@@ -670,16 +670,16 @@ def test_mem_access_counter_mixed():
     else:
         assert f32nonconsec == 3*n*m*ell
 
-    f64uniform = mem_map[lp.MemAccess('global', np.float64,
+    f64uniform = mem_map[lp.MemAccess("global", np.float64,
                                 lid_strides={}, gid_strides={},
-                                direction='store', variable='e',
+                                direction="store", variable="e",
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*group_size_0},
-                                direction='store',
-                                variable='c',
+    f32nonconsec = mem_map[lp.MemAccess("global", np.float32,
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*group_size_0},
+                                direction="store",
+                                variable="c",
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
 
@@ -717,52 +717,52 @@ def test_mem_access_counter_nonconsec():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
-    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*lsize0},
-                                direction='load',
-                                variable='g',
+    params = {"n": n, "m": m, "ell": ell}
+    f64nonconsec = mem_map[lp.MemAccess("global", np.float64,
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*lsize0},
+                                direction="load",
+                                variable="g",
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
-    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*lsize0},
-                                direction='load',
-                                variable='h',
+    f64nonconsec += mem_map[lp.MemAccess("global", np.float64,
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*lsize0},
+                                direction="load",
+                                variable="h",
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess(
-                            'global', np.dtype(np.float32),
-                            lid_strides={0: Variable('m')*Variable('ell')},
-                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
-                            direction='load', variable='a',
+                            "global", np.dtype(np.float32),
+                            lid_strides={0: Variable("m")*Variable("ell")},
+                            gid_strides={0: Variable("m")*Variable("ell")*lsize0},
+                            direction="load", variable="a",
                             count_granularity=CG.WORKITEM
                             )
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess(
-                            'global', np.dtype(np.float32),
-                            lid_strides={0: Variable('m')*Variable('ell')},
-                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
-                            direction='load', variable='b',
+                            "global", np.dtype(np.float32),
+                            lid_strides={0: Variable("m")*Variable("ell")},
+                            gid_strides={0: Variable("m")*Variable("ell")*lsize0},
+                            direction="load", variable="b",
                             count_granularity=CG.WORKITEM
                             )
                             ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
 
-    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')},
-                                gid_strides={0: Variable('m')*lsize0},
-                                direction='store',
-                                variable='e',
+    f64nonconsec = mem_map[lp.MemAccess("global", np.float64,
+                                lid_strides={0: Variable("m")},
+                                gid_strides={0: Variable("m")*lsize0},
+                                direction="store",
+                                variable="e",
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess(
-                            'global', np.float32,
-                            lid_strides={0: Variable('m')*Variable('ell')},
-                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
-                            direction='store', variable='c',
+                            "global", np.float32,
+                            lid_strides={0: Variable("m")*Variable("ell")},
+                            gid_strides={0: Variable("m")*Variable("ell")*lsize0},
+                            direction="store", variable="c",
                             count_granularity=CG.WORKITEM
                             )
                            ].eval_with_dict(params)
@@ -772,37 +772,37 @@ def test_mem_access_counter_nonconsec():
     mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True,
                                       subgroup_size=64)
     f64nonconsec = mem_map64[lp.MemAccess(
-                    'global',
+                    "global",
                     np.float64,
-                    lid_strides={0: Variable('m')},
-                    gid_strides={0: Variable('m')*lsize0},
-                    direction='load', variable='g',
+                    lid_strides={0: Variable("m")},
+                    gid_strides={0: Variable("m")*lsize0},
+                    direction="load", variable="g",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f64nonconsec += mem_map64[lp.MemAccess(
-                    'global',
+                    "global",
                     np.float64,
-                    lid_strides={0: Variable('m')},
-                    gid_strides={0: Variable('m')*lsize0},
-                    direction='load', variable='h',
+                    lid_strides={0: Variable("m")},
+                    gid_strides={0: Variable("m")*lsize0},
+                    direction="load", variable="h",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32nonconsec = mem_map64[lp.MemAccess(
-                    'global',
+                    "global",
                     np.dtype(np.float32),
-                    lid_strides={0: Variable('m')*Variable('ell')},
-                    gid_strides={0: Variable('m')*Variable('ell')*lsize0},
-                    direction='load',
-                    variable='a',
+                    lid_strides={0: Variable("m")*Variable("ell")},
+                    gid_strides={0: Variable("m")*Variable("ell")*lsize0},
+                    direction="load",
+                    variable="a",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32nonconsec += mem_map64[lp.MemAccess(
-                    'global',
+                    "global",
                     np.dtype(np.float32),
-                    lid_strides={0: Variable('m')*Variable('ell')},
-                    gid_strides={0: Variable('m')*Variable('ell')*lsize0},
-                    direction='load',
-                    variable='b',
+                    lid_strides={0: Variable("m")*Variable("ell")},
+                    gid_strides={0: Variable("m")*Variable("ell")*lsize0},
+                    direction="load",
+                    variable="b",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
@@ -825,52 +825,52 @@ def test_mem_access_counter_consec():
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size='guess')
+                                    subgroup_size="guess")
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     f64consec = mem_map[lp.MemAccess(
-                    'global', np.float64,
-                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
-                    direction='load', variable='g',
+                    "global", np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable("m")},
+                    direction="load", variable="g",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f64consec += mem_map[lp.MemAccess(
-                    'global', np.float64,
-                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
-                    direction='load', variable='h',
+                    "global", np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable("m")},
+                    direction="load", variable="h",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess(
-                    'global', np.float32,
+                    "global", np.float32,
                     lid_strides={0: 1},
-                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
-                    direction='load', variable='a',
+                    gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")},
+                    direction="load", variable="a",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32consec += mem_map[lp.MemAccess(
-                    'global', np.dtype(np.float32),
+                    "global", np.dtype(np.float32),
                     lid_strides={0: 1},
-                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
-                    direction='load', variable='b',
+                    gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")},
+                    direction="load", variable="b",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f64consec == 2*n*m*ell
     assert f32consec == 3*n*m*ell
 
     f64consec = mem_map[lp.MemAccess(
-                    'global', np.float64,
-                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
-                    direction='store', variable='e',
+                    "global", np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable("m")},
+                    direction="store", variable="e",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess(
-                    'global', np.float32,
+                    "global", np.float32,
                     lid_strides={0: 1},
-                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
-                    direction='store', variable='c',
+                    gid_strides={0: Variable("m")*Variable("ell"), 1: Variable("m")},
+                    direction="store", variable="c",
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f64consec == n*m*ell
@@ -885,7 +885,7 @@ def test_count_granularity_val_checks():
         lp.MemAccess(count_granularity=CG.WORKGROUP)
         lp.MemAccess(count_granularity=None)
         assert True
-        lp.MemAccess(count_granularity='bushel')
+        lp.MemAccess(count_granularity="bushel")
         assert False
     except ValueError:
         assert True
@@ -896,7 +896,7 @@ def test_count_granularity_val_checks():
         lp.Op(count_granularity=CG.WORKGROUP)
         lp.Op(count_granularity=None)
         assert True
-        lp.Op(count_granularity='bushel')
+        lp.Op(count_granularity="bushel")
         assert False
     except ValueError:
         assert True
@@ -920,7 +920,7 @@ def test_barrier_counter_nobarriers():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
     assert len(sync_map) == 1
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
 
@@ -947,7 +947,7 @@ def test_barrier_counter_barriers():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
     barrier_count = sync_map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
@@ -970,7 +970,7 @@ def test_all_counters_parallel_matmul():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
     group_size = bsize*bsize
     n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -983,16 +983,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     f32mul = op_map[
-                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
+                        lp.Op(np.float32, "mul", CG.SUBGROUP)
                         ].eval_with_dict(params)
     f32add = op_map[
-                        lp.Op(np.float32, 'add', CG.SUBGROUP)
+                        lp.Op(np.float32, "add", CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        lp.Op(np.int32, 'add', CG.SUBGROUP)
+                        lp.Op(np.int32, "add", CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
+                        lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP)
                         ].eval_with_dict(params)
 
     # (count-per-sub-group)*n_subgroups
@@ -1001,26 +1001,26 @@ def test_all_counters_parallel_matmul():
     mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                            subgroup_size=SGS)
 
-    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={0: 1, 1: Variable('ell')},
+    f32s1lb = mem_access_map[lp.MemAccess("global", np.float32,
+                             lid_strides={0: 1, 1: Variable("ell")},
                              gid_strides={1: bsize},
-                             direction='load', variable='b',
+                             direction="load", variable="b",
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
-    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={0: 1, 1: Variable('m')},
-                             gid_strides={0: Variable('m')*bsize},
-                             direction='load',
-                             variable='a', count_granularity=CG.WORKITEM)
+    f32s1la = mem_access_map[lp.MemAccess("global", np.float32,
+                             lid_strides={0: 1, 1: Variable("m")},
+                             gid_strides={0: Variable("m")*bsize},
+                             direction="load",
+                             variable="a", count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
     assert f32s1lb == n*m*ell/bsize
     assert f32s1la == n*m*ell/bsize
 
-    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={0: 1, 1: Variable('ell')},
-                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
-                             direction='store', variable='c',
+    f32coal = mem_access_map[lp.MemAccess("global", np.float32,
+                             lid_strides={0: 1, 1: Variable("ell")},
+                             gid_strides={0: Variable("ell")*bsize, 1: bsize},
+                             direction="store", variable="c",
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
@@ -1028,32 +1028,32 @@ def test_all_counters_parallel_matmul():
 
     local_mem_map = lp.get_mem_access_map(knl,
                         count_redundant_work=True,
-                        subgroup_size=SGS).filter_by(mtype=['local'])
+                        subgroup_size=SGS).filter_by(mtype=["local"])
 
-    local_mem_l = local_mem_map.filter_by(direction=['load']
+    local_mem_l = local_mem_map.filter_by(direction=["load"]
                                           ).eval_and_sum(params)
     # (count-per-sub-group)*n_subgroups
     assert local_mem_l == m*2*n_subgroups
 
-    local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
-                                               direction='load',
+    local_mem_l_a = local_mem_map[lp.MemAccess("local", np.dtype(np.float32),
+                                               direction="load",
                                                lid_strides={1: 16},
                                                gid_strides={},
-                                               variable='a_fetch',
+                                               variable="a_fetch",
                                                count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
-    local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
-                                               direction='load',
+    local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32),
+                                               direction="load",
                                                lid_strides={0: 1},
                                                gid_strides={},
-                                               variable='b_fetch',
+                                               variable="b_fetch",
                                                count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
 
     # (count-per-sub-group)*n_subgroups
     assert local_mem_l_a == local_mem_l_b == m*n_subgroups
 
-    local_mem_s = local_mem_map.filter_by(direction=['store']
+    local_mem_s = local_mem_map.filter_by(direction=["store"]
                                           ).eval_and_sum(params)
 
     # (count-per-sub-group)*n_subgroups
@@ -1122,7 +1122,7 @@ def test_mem_access_tagged_variables():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
     group_size = bsize*bsize
     n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -1131,19 +1131,19 @@ def test_mem_access_tagged_variables():
     mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                            subgroup_size=SGS)
 
-    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
+    f32s1lb = mem_access_map[lp.MemAccess("global", np.float32,
                              lid_strides={0: 1},
                              gid_strides={1: bsize},
-                             direction='load', variable='b',
-                             variable_tag='mmbload',
+                             direction="load", variable="b",
+                             variable_tag="mmbload",
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
-    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={1: Variable('m')},
-                             gid_strides={0: Variable('m')*bsize},
-                             direction='load',
-                             variable='a',
-                             variable_tag='mmaload',
+    f32s1la = mem_access_map[lp.MemAccess("global", np.float32,
+                             lid_strides={1: Variable("m")},
+                             gid_strides={0: Variable("m")*bsize},
+                             direction="load",
+                             variable="a",
+                             variable_tag="mmaload",
                              count_granularity=CG.SUBGROUP)
                              ].eval_with_dict(params)
 
@@ -1152,11 +1152,11 @@ def test_mem_access_tagged_variables():
     # uniform: (count-per-sub-group)*n_subgroups
     assert f32s1la == m*n_subgroups
 
-    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={0: 1, 1: Variable('ell')},
-                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
-                             direction='store', variable='c',
-                             variable_tag='mmresult',
+    f32coal = mem_access_map[lp.MemAccess("global", np.float32,
+                             lid_strides={0: 1, 1: Variable("ell")},
+                             gid_strides={0: Variable("ell")*bsize, 1: bsize},
+                             direction="store", variable="c",
+                             variable_tag="mmresult",
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
@@ -1213,7 +1213,7 @@ def test_summations_and_filters():
     n = 512
     m = 256
     ell = 128
-    params = {'n': n, 'm': m, 'ell': ell}
+    params = {"n": n, "m": m, "ell": ell}
 
     n_workgroups = 1
     group_size = 1
@@ -1223,24 +1223,24 @@ def test_summations_and_filters():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
 
-    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
+    loads_a = mem_map.filter_by(direction=["load"], variable=["a"],
                                 count_granularity=[CG.SUBGROUP]
                                 ).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
     assert loads_a == (2*n*m*ell)*n_subgroups
 
-    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
+    global_stores = mem_map.filter_by(mtype=["global"], direction=["store"],
                                       count_granularity=[CG.SUBGROUP]
                                       ).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
     assert global_stores == (n*m*ell + n*m)*n_subgroups
 
-    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
+    ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"],
                                  count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
-    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
+    st_bytes = mem_map.filter_by(mtype=["global"], direction=["store"],
                                  count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
 
@@ -1249,10 +1249,10 @@ def test_summations_and_filters():
     assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups
 
     # ignore stride and variable names in this map
-    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
-    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
+    reduced_map = mem_map.group_by("mtype", "dtype", "direction")
+    f32lall = reduced_map[lp.MemAccess("global", np.float32, direction="load")
                           ].eval_with_dict(params)
-    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
+    f64lall = reduced_map[lp.MemAccess("global", np.float64, direction="load")
                           ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -1264,7 +1264,7 @@ def test_summations_and_filters():
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
-    op_map_dtype = op_map.group_by('dtype')
+    op_map_dtype = op_map.group_by("dtype")
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
     f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
     i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
@@ -1272,7 +1272,7 @@ def test_summations_and_filters():
     assert f64 == n*m
     assert i32 == n*m*2
 
-    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
+    addsub_all = op_map.filter_by(name=["add", "sub"]).eval_and_sum(params)
     f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
     assert addsub_all == n*m*ell + n*m*2
     assert f32ops_all == n*m*ell*3
@@ -1280,16 +1280,16 @@ def test_summations_and_filters():
     non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
     assert non_field == 0
 
-    ops_nodtype = op_map.group_by('name')
-    ops_noname = op_map.group_by('dtype')
-    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
+    ops_nodtype = op_map.group_by("name")
+    ops_noname = op_map.group_by("dtype")
+    mul_all = ops_nodtype[lp.Op(name="mul")].eval_with_dict(params)
     f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
     assert mul_all == n*m*ell + n*m
     assert f64ops_all == n*m
 
     def func_filter(key):
         return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \
-               key.direction == 'load'
+               key.direction == "load"
     f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -1313,7 +1313,7 @@ def test_strided_footprint():
     knl = lp.split_iname(knl, "i_inner", bx, outer_tag="unr", inner_tag="l.0")
 
     footprints = lp.gather_access_footprints(knl)
-    x_l_foot = footprints[('x', 'read')]
+    x_l_foot = footprints[("x", "read")]
 
     from loopy.statistics import count
     num = count(knl, x_l_foot).eval_with_dict(param_dict)
diff --git a/test/test_target.py b/test/test_target.py
index e27f6a32a3e84ce29ac9b6d0c817c989ee75058e..38f2017cac73efb56addd13191a510b4991941ba 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -48,7 +48,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -279,10 +279,10 @@ def test_numba_cuda_target():
         target=lp.NumbaCudaTarget())
 
     knl = lp.assume(knl, "M>0")
-    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
-    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
+    knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1))
     knl = lp.add_prefetch(knl, "X[i,:]",
-            fetch_outer_inames='i_inner, i_outer, j_inner',
+            fetch_outer_inames="i_inner, i_outer, j_inner",
             default_tag="l.auto")
     knl = lp.fix_parameters(knl, N=3)
     knl = lp.prioritize_loops(knl, "i_inner,j_outer")
@@ -327,7 +327,7 @@ def test_child_invalid_type_cast():
 
 
 def test_target_invalid_type_cast():
-    dtype = np.dtype([('', '<u4'), ('', '<i4')])
+    dtype = np.dtype([("", "<u4"), ("", "<i4")])
     with pytest.raises(lp.LoopyError):
         lp.TypeCast(dtype, 1)
 
@@ -383,7 +383,7 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     # test numpy input for x is written to and returned
-    knl = lp.make_kernel('{:}', ['x[0] = y[0] + x[0]'])
+    knl = lp.make_kernel("{:}", ["x[0] = y[0] + x[0]"])
 
     y = np.array([3.])
     x = np.array([4.])
@@ -394,14 +394,14 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
     # test numpy input for x is written to and returned, even when a pyopencl array
     # is passed for y
     import pyopencl.array as cla
-    y = cla.zeros(queue, shape=(1), dtype='float64') + 3.
+    y = cla.zeros(queue, shape=(1), dtype="float64") + 3.
     x = np.array([4.])
     evt, out = knl(queue, y=y, x=x)
     assert out[0] is x
     assert x[0] == 7.
 
     # test numpy input for x is written to and returned, even when output-only
-    knl = lp.make_kernel('{:}', ['x[0] = y[0] + 2'])
+    knl = lp.make_kernel("{:}", ["x[0] = y[0] + 2"])
 
     y = np.array([3.])
     x = np.array([4.])
diff --git a/test/test_transform.py b/test/test_transform.py
index e4ca2af0d657cea7769c9a573c14a79e8c197132..9940786f397613d046944db880d5dbda9ffc585f 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -45,7 +45,7 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \
 
 __all__ = [
         "pytest_generate_tests",
-        "cl"  # 'cl.create_some_context'
+        "cl"  # "cl.create_some_context"
         ]
 
 
@@ -98,8 +98,8 @@ def test_to_batched(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<n } ''',
-         ''' out[i] = sum(j, a[i,j]*x[j])''')
+         """ { [i,j]: 0<=i,j<n } """,
+         """ out[i] = sum(j, a[i,j]*x[j])""")
     knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                             x=np.float32,
                                             a=np.float32))
@@ -107,8 +107,8 @@ def test_to_batched(ctx_factory):
     bknl = lp.to_batched(knl, "nbatches", "out,x")
 
     ref_knl = lp.make_kernel(
-         ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
-         '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
+         """ { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} """,
+         """out[k, i] = sum(j, a[i,j]*x[k, j])""")
     ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                     x=np.float32,
                                                     a=np.float32))
@@ -128,20 +128,20 @@ def test_to_batched_temp(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<n } ''',
-         ''' cnst = 2.0
-         out[i] = sum(j, cnst*a[i,j]*x[j])''',
+         """ { [i,j]: 0<=i,j<n } """,
+         """ cnst = 2.0
+         out[i] = sum(j, cnst*a[i,j]*x[j])""",
          [lp.TemporaryVariable(
              "cnst",
              dtype=np.float32,
              shape=(),
-             address_space=lp.AddressSpace.PRIVATE), '...'])
+             address_space=lp.AddressSpace.PRIVATE), "..."])
     knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                             x=np.float32,
                                             a=np.float32))
     ref_knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<n } ''',
-         '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
+         """ { [i,j]: 0<=i,j<n } """,
+         """out[i] = sum(j, 2.0*a[i,j]*x[j])""")
     ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                     x=np.float32,
                                                     a=np.float32))
@@ -150,7 +150,7 @@ def test_to_batched_temp(ctx_factory):
     bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
 
-    # checking that cnst is not being bathced
+    # checking that cnst is not being batched
-    assert bknl.temporary_variables['cnst'].shape == ()
+    assert bknl.temporary_variables["cnst"].shape == ()
 
     a = np.random.randn(5, 5)
     x = np.random.randn(7, 5)
@@ -187,8 +187,8 @@ def test_rename_argument(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     kernel = lp.make_kernel(
-         '''{ [i]: 0<=i<n }''',
-         '''out[i] = a + 2''')
+         """{ [i]: 0<=i<n }""",
+         """out[i] = a + 2""")
 
     kernel = lp.rename_argument(kernel, "a", "b")
 
@@ -199,14 +199,14 @@ def test_rename_argument(ctx_factory):
 
 def test_fusion():
     exp_kernel = lp.make_kernel(
-         ''' { [i]: 0<=i<n } ''',
-         ''' exp[i] = pow(E, z[i])''',
+         """ { [i]: 0<=i<n } """,
+         """ exp[i] = pow(E, z[i])""",
          assumptions="n>0")
 
     sum_kernel = lp.make_kernel(
-        '{ [j]: 0<=j<n }',
-        'out2 = sum(j, exp[j])',
-        assumptions='n>0')
+        "{ [j]: 0<=j<n }",
+        "out2 = sum(j, exp[j])",
+        assumptions="n>0")
 
     knl = lp.fuse_kernels([exp_kernel, sum_kernel])
 
@@ -374,8 +374,8 @@ def test_precompute_confusing_subst_arguments(ctx_factory):
 
     from loopy.symbolic import get_dependencies
     assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
-    knl = lp.precompute(knl, "D", sweep_inames='j',
-            precompute_outer_inames='j, i_inner, i_outer')
+    knl = lp.precompute(knl, "D", sweep_inames="j",
+            precompute_outer_inames="j, i_inner, i_outer")
 
     lp.auto_test_vs_ref(
             ref_knl, ctx, knl,
@@ -541,13 +541,13 @@ def test_split_iname_only_if_in_within():
             a[i] = 2*b[i] {id=not_to_split}
             """)
 
-    knl = lp.split_iname(knl, "i", 4, within='id:to_split')
+    knl = lp.split_iname(knl, "i", 4, within="id:to_split")
 
     for insn in knl.instructions:
-        if insn.id == 'to_split':
-            assert insn.within_inames == frozenset({'i_outer', 'i_inner'})
-        if insn.id == 'not_to_split':
-            assert insn.within_inames == frozenset({'i'})
+        if insn.id == "to_split":
+            assert insn.within_inames == frozenset({"i_outer", "i_inner"})
+        if insn.id == "not_to_split":
+            assert insn.within_inames == frozenset({"i"})
 
 
 def test_nested_substs_in_insns(ctx_factory):
@@ -576,11 +576,11 @@ def test_extract_subst_with_iname_deps_in_templ(ctx_factory):
             """
             y[i, j, k] = x[i, j, k]
             """,
-            [lp.GlobalArg('x,y', shape=lp.auto, dtype=float)],
+            [lp.GlobalArg("x,y", shape=lp.auto, dtype=float)],
             lang_version=(2018, 2))
 
-    knl = lp.extract_subst(knl, 'rule1', 'x[i, arg1, arg2]',
-            parameters=('arg1', 'arg2'))
+    knl = lp.extract_subst(knl, "rule1", "x[i, arg1, arg2]",
+            parameters=("arg1", "arg2"))
 
     lp.auto_test_vs_ref(knl, ctx_factory(), knl)
 
@@ -660,12 +660,12 @@ def test_add_inames_for_unused_hw_axes(ctx_factory):
 
     knl = lp.add_inames_for_unused_hw_axes(knl)
 
-    assert knl.id_to_insn['init_alpha'].within_inames == frozenset(['i_inner',
-        'i_outer', 'j_outer', 'j_inner'])
-    assert knl.id_to_insn['a_fetch_rule'].within_inames == frozenset(['i_inner',
-        'i_outer', 'j_outer', 'j_inner'])
-    assert knl.id_to_insn['b_fetch_rule'].within_inames == frozenset(['i_inner',
-        'i_outer', 'j_outer', 'j_inner'])
+    assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner",
+        "i_outer", "j_outer", "j_inner"])
+    assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner",
+        "i_outer", "j_outer", "j_inner"])
+    assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner",
+        "i_outer", "j_outer", "j_inner"])
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"],
diff --git a/test/testlib.py b/test/testlib.py
index 67c5ba04fefde9a7516f22bce679744ce61a4f20..ffccf6c1a167e96ec3510180cc7928ba68c1e0f4 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -57,15 +57,15 @@ class SeparateTemporariesPreambleTestMangler(
 
-        # check types
-        if len(arg_dtypes) != len(arg_dtypes):
-            raise Exception('Unexpected number of arguments provided to mangler '
-                            '{}, expected {}, got {}'.format(
+        # check argument count, then types
+        if len(self.func_arg_dtypes) != len(arg_dtypes):
+            raise Exception("Unexpected number of arguments provided to mangler "
+                            "{}, expected {}, got {}".format(
                                 self.func_name, len(self.func_arg_dtypes),
                                 len(arg_dtypes)))
 
         for i, (d1, d2) in enumerate(zip(self.func_arg_dtypes, arg_dtypes)):
             if not __compare(d1, d2):
-                raise Exception('Argument at index {} for mangler {} does not '
-                                'match expected dtype.  Expected {}, got {}'.
+                raise Exception("Argument at index {} for mangler {} does not "
+                                "match expected dtype.  Expected {}, got {}".
                                 format(i, self.func_name, str(d1), str(d2)))
 
         # get target for creation
@@ -85,7 +85,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
         func_match = next(
             (x for x in preamble_info.seen_functions
              if x.name == self.func_name), None)
-        desc = 'custom_funcs_indirect'
+        desc = "custom_funcs_indirect"
         if func_match is not None:
             from loopy.types import to_loopy_type
             # check types
@@ -93,7 +93,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
                     func_match.arg_dtypes:
                 # if match, create our temporary
                 var = lp.TemporaryVariable(
-                    'lookup', initializer=self.arr, dtype=self.arr.dtype,
+                    "lookup", initializer=self.arr, dtype=self.arr.dtype,
                     shape=self.arr.shape,
                     address_space=lp.AddressSpace.GLOBAL, read_only=True)
                 # and code
@@ -127,7 +127,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
             decl = Initializer(decl, generate_array_literal(
                 codegen_state, var, var.initializer))
         # return generated code
-        yield (desc, '\n'.join([str(decl), code]))
+        yield (desc, "\n".join([str(decl), code]))
 
 # }}}