diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 70017ee228c054b2818428e997257de638c5a2fb..a865134add02fd7527bf336d0c3596e2777aa232 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
             uses: actions/setup-python@v1
             with:
                 # matches compat target in setup.py
-                python-version: '3.6'
+                python-version: '3.8'
         -   name: "Main Script"
             run: |
                 curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
@@ -35,6 +35,19 @@ jobs:
                 curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
                 . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py
 
+    mypy:
+        name: Mypy
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                curl -L -O https://tiker.net/ci-support-v0
+                . ./ci-support-v0
+                build_py_project_in_conda_env
+                python -m pip install mypy
+                ./run-mypy.sh
+
     pytest:
         name: Conda Pytest
         runs-on: ubuntu-latest
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f8e08058c3e5d61f56655d88a5cebbab8a5f202c..5a8273849e03cab0dfd632f8f18f47bfa659314b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -168,6 +168,18 @@ Flake8:
   except:
   - tags
 
+Mypy:
+  script: |
+    curl -L -O https://tiker.net/ci-support-v0
+    . ./ci-support-v0
+    build_py_project_in_venv
+    python -m pip install mypy
+    ./run-mypy.sh
+  tags:
+  - python3
+  except:
+  - tags
+
 Benchmarks:
   stage: test
   script:
diff --git a/doc/conf.py b/doc/conf.py
index 7cd9884fd5041a55643fba7e31afea54a8ab1b1f..ce500cbecd1d15ab97c6bfa95934b4034d0bf17c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -35,6 +35,25 @@ intersphinx_mapping = {
     "https://pyrsistent.readthedocs.io/en/latest/": None,
     }
 
+# Some modules need to import things just so that sphinx can resolve symbols in
+# type annotations. Often, we do not want these imports (e.g. of PyOpenCL) in
+# normal use, because they would introduce unintended side effects or hard
+# dependencies. This flag exists so that these imports only occur during the
+# doc build. Since sphinx appears to resolve type hints lexically (as it
+# should), the mechanism needs to work across modules: an inherited
+# arraycontext docstring, for example, may be read by sphinx when building
+# meshmode, a dependent package. Hence all packages involved check a setting
+# of the same name, which is why the name is as global-sounding as it is.
+import sys
+sys._BUILDING_SPHINX_DOCS = True
+
 nitpick_ignore_regex = [
         ["py:class", r"typing_extensions\.(.+)"],
+        ["py:class", r"numpy\.u?int[0-9]+"],
+        ["py:class", r"numpy\.float[0-9]+"],
+        ["py:class", r"numpy\.complex[0-9]+"],
+
+        # As of 2022-06-22, the immutables package does not appear to provide
+        # sphinx documentation that we could link against.
+        ["py:class", r"immutables\.(.+)"],
         ]
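
For reference, a module that consumes this flag can gate its doc-only imports on a
getattr check; loopy/codegen/__init__.py uses exactly this pattern later in this
diff. A minimal standalone sketch (the pyopencl import is purely illustrative):

    import sys
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # seen only by static type checkers
        import pyopencl as cl  # noqa: F401

    if getattr(sys, "_BUILDING_SPHINX_DOCS", False):
        # imported only while sphinx builds the docs, so that names used in
        # type annotations resolve; skipped entirely in normal use
        import pyopencl as cl  # noqa: F401, F811
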
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 922315685579089e787389ec5c2f74c56b690092..c53c56530d43a2ec03b2126cc70cbaf3221caf33 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -515,24 +515,14 @@ Arguments
 ^^^^^^^^^
 
 .. autoclass:: KernelArgument
-    :members:
-    :undoc-members:
 
 .. autoclass:: ValueArg
-    :members:
-    :undoc-members:
 
 .. autoclass:: ArrayArg
-    :members:
-    :undoc-members:
 
 .. autoclass:: ConstantArg
-    :members:
-    :undoc-members:
 
 .. autoclass:: ImageArg
-    :members:
-    :undoc-members:
 
 .. _temporaries:
 
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index b3cfbc5c485f9670f59de4edda80ffabf6e23076..9ef012d663e7a8eb32c2cb1fa236018200338600 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -52,6 +52,8 @@ Influencing data access
 
 .. automodule:: loopy.transform.privatize
 
+.. autofunction:: allocate_temporaries_for_base_storage
+
 Padding Data
 ------------
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 7fe11871b1fc91f9d64232f0e145d7ee8b6a6819..13a597e5b2c82b2ec6ea5c872b6edc6ee70e29d6 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -235,7 +235,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`:
         if allocator is None:
             allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
     <BLANKLINE>
-        # {{{ find integer arguments from shapes
+        # {{{ find integer arguments from array data
     <BLANKLINE>
         if n is None:
             if a is not None:
@@ -1228,11 +1228,11 @@ should call :func:`loopy.get_one_linearized_kernel`:
    ...
    ---------------------------------------------------------------------------
    LINEARIZATION:
-      0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
+      0: CALL KERNEL rotate_v2
       1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
       2: RETURN FROM KERNEL rotate_v2
       3: ... gbarrier
-      4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
+      4: CALL KERNEL rotate_v2_0
       5:     arr[(1 + i_inner + i_outer*16) % n] = tmp  {id=rotate}
       6: RETURN FROM KERNEL rotate_v2_0
    ---------------------------------------------------------------------------
@@ -1260,18 +1260,18 @@ put those instructions into the schedule.
    ...
    ---------------------------------------------------------------------------
    TEMPORARIES:
-   tmp: type: np:dtype('int32'), shape: () aspace:private
-   tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace:global
+   tmp: type: np:dtype('int32'), shape: () aspace: private
+   tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace: global
    ---------------------------------------------------------------------------
    ...
    ---------------------------------------------------------------------------
    LINEARIZATION:
-      0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
+      0: CALL KERNEL rotate_v2
       1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
       2:     tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp  {id=tmp.save}
       3: RETURN FROM KERNEL rotate_v2
       4: ... gbarrier
-      5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
+      5: CALL KERNEL rotate_v2_0
       6:     tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]  {id=tmp.reload}
       7:     arr[(1 + i_inner + i_outer*16) % n] = tmp  {id=rotate}
       8: RETURN FROM KERNEL rotate_v2_0
@@ -1297,7 +1297,7 @@ The kernel translates into two OpenCL kernels.
    #define lid(N) ((int) get_local_id(N))
    #define gid(N) ((int) get_group_id(N))
    <BLANKLINE>
-   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int const *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
    {
      int tmp;
    <BLANKLINE>
@@ -1305,7 +1305,7 @@ The kernel translates into two OpenCL kernels.
      tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
    }
    <BLANKLINE>
-   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int const *__restrict__ tmp_save_slot)
    {
      int tmp;
    <BLANKLINE>
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 7bebb98bd6804520d7187cd3d4ad302848633fe7..ce3ba1439659b3dd118dc011e94ecb09c2f4f343 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -95,7 +95,8 @@ from loopy.transform.data import (
         alias_temporaries, set_argument_order,
         rename_argument,
         set_temporary_scope,
-        set_temporary_address_space)
+        set_temporary_address_space,
+        allocate_temporaries_for_base_storage)
 
 from loopy.transform.subst import (extract_subst,
         assignment_to_subst, expand_subst, find_rules_matching,
@@ -157,7 +158,6 @@ from loopy.target.cuda import CudaTarget
 from loopy.target.opencl import OpenCLTarget
 from loopy.target.pyopencl import PyOpenCLTarget
 from loopy.target.ispc import ISPCTarget
-from loopy.target.numba import NumbaTarget, NumbaCudaTarget
 
 from loopy.tools import Optional, t_unit_to_python, memoize_on_disk
 
@@ -216,6 +216,7 @@ __all__ = [
         "remove_unused_arguments",
         "alias_temporaries", "set_argument_order",
         "rename_argument", "set_temporary_scope", "set_temporary_address_space",
+        "allocate_temporaries_for_base_storage",
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
@@ -302,7 +303,6 @@ __all__ = [
         "CWithGNULibcTarget", "ExecutableCWithGNULibcTarget",
         "CudaTarget", "OpenCLTarget",
         "PyOpenCLTarget", "ISPCTarget",
-        "NumbaTarget", "NumbaCudaTarget",
         "ASTBuilderBase",
 
         "Optional", "memoize_on_disk",
@@ -366,7 +366,7 @@ def set_options(kernel, *args, **kwargs):
 # {{{ library registration
 
 @for_each_kernel
-def register_preamble_generators(kernel, preamble_generators):
+def register_preamble_generators(kernel: LoopKernel, preamble_generators):
     """
     :arg manglers: list of functions of signature ``(preamble_info)``
         generating tuples ``(sortable_str_identifier, code)``,
@@ -376,7 +376,8 @@ def register_preamble_generators(kernel, preamble_generators):
     """
     from loopy.tools import unpickles_equally
 
-    new_pgens = kernel.preamble_generators[:]
+    new_pgens = tuple(kernel.preamble_generators)
+
     for pgen in preamble_generators:
         if pgen not in new_pgens:
             if not unpickles_equally(pgen):
@@ -385,7 +386,7 @@ def register_preamble_generators(kernel, preamble_generators):
                         "and would thus disrupt loopy's caches"
                         % pgen)
 
-            new_pgens.insert(0, pgen)
+            new_pgens = (pgen,) + new_pgens
 
     return kernel.copy(preamble_generators=new_pgens)
 
@@ -394,7 +395,7 @@ def register_preamble_generators(kernel, preamble_generators):
 def register_symbol_manglers(kernel, manglers):
     from loopy.tools import unpickles_equally
 
-    new_manglers = kernel.symbol_manglers[:]
+    new_manglers = kernel.symbol_manglers
     for m in manglers:
         if m not in new_manglers:
             if not unpickles_equally(m):
@@ -403,7 +404,7 @@ def register_symbol_manglers(kernel, manglers):
                         "and would disrupt loopy's caches"
                         % m)
 
-            new_manglers.insert(0, m)
+            new_manglers = (m,) + new_manglers
 
     return kernel.copy(symbol_manglers=new_manglers)
 
@@ -484,7 +485,8 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None):
     result = make_kernel(set_str,
             "output[%s] = input[%s]"
             % (commad_indices, commad_indices),
-            lang_version=MOST_RECENT_LANGUAGE_VERSION)
+            lang_version=MOST_RECENT_LANGUAGE_VERSION,
+            default_offset=auto)
 
     result = tag_array_axes(result, "input", old_dim_tags)
     result = tag_array_axes(result, "output", new_dim_tags)
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 8b89391e29b1d1f1d770aab24434a94ed4562a7b..2b5ccb54d5a18ed2f4ceaf762619aea73fa4d86d 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -20,16 +20,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import TYPE_CHECKING, Tuple, Optional
+from dataclasses import dataclass
 from warnings import warn
 
-from pytools import Record
-
 import numpy as np
 
 import loopy as lp
+from loopy.kernel.array import get_strides
 
 from loopy.diagnostic import LoopyError, AutomaticTestFailure
 
+if TYPE_CHECKING:
+    import pyopencl.array as cla
+
 
 AUTO_TEST_SKIP_RUN = False
 
@@ -68,13 +72,34 @@ def fill_rand(ary):
         fill_rand(ary)
 
 
-class TestArgInfo(Record):
-    pass
+@dataclass
+class TestArgInfo:
+    name: str
+    ref_array: "cla.Array"
+    ref_storage_array: "cla.Array"
+
+    ref_pre_run_array: "cla.Array"
+    ref_pre_run_storage_array: "cla.Array"
+
+    ref_shape: Tuple[int, ...]
+    ref_strides: Tuple[int, ...]
+    ref_alloc_size: int
+    ref_numpy_strides: Tuple[int, ...]
+    needs_checking: bool
+
+    # The attributes below are being modified in make_args, hence this dataclass
+    # cannot be frozen.
+    test_storage_array: Optional["cla.Array"] = None
+    test_array: Optional["cla.Array"] = None
+    test_shape: Optional[Tuple[int, ...]] = None
+    test_strides: Optional[Tuple[int, ...]] = None
+    test_numpy_strides: Optional[Tuple[int, ...]] = None
+    test_alloc_size: Optional[int] = None
 
 
 # {{{ "reference" arguments
 
-def make_ref_args(kernel, impl_arg_info, queue, parameters):
+def make_ref_args(kernel, queue, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
@@ -86,13 +111,8 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
     ref_args = {}
     ref_arg_data = []
 
-    for arg in impl_arg_info:
-        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
-
-        if arg.arg_class is ValueArg:
-            if arg.offset_for_name:
-                continue
-
+    for arg in kernel.args:
+        if isinstance(arg, ValueArg):
             arg_value = parameters[arg.name]
 
             try:
@@ -107,25 +127,24 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
 
             ref_arg_data.append(None)
 
-        elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
-                or arg.arg_class is ConstantArg:
+        elif isinstance(arg, (ArrayArg, ImageArg, ConstantArg)):
             if arg.shape is None or any(saxis is None for saxis in arg.shape):
                 raise LoopyError("array '%s' needs known shape to use automatic "
                         "testing" % arg.name)
 
-            shape = evaluate_shape(arg.unvec_shape, parameters)
-            dtype = kernel_arg.dtype
+            shape = evaluate_shape(arg.shape, parameters)
+            dtype = arg.dtype
 
-            is_output = kernel_arg.is_output
+            is_output = arg.is_output
 
-            if arg.arg_class is ImageArg:
+            if isinstance(arg, ImageArg):
                 storage_array = ary = cl_array.empty(
                         queue, shape, dtype, order="C")
                 numpy_strides = None
                 alloc_size = None
                 strides = None
             else:
-                strides = evaluate(arg.unvec_strides, parameters)
+                strides = evaluate(get_strides(arg), parameters)
 
                 alloc_size = sum(astrd*(alen-1) if astrd != 0 else alen-1
                         for alen, astrd in zip(shape, strides)) + 1
@@ -142,13 +161,13 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
 
                 storage_array = cl_array.empty(queue, alloc_size, dtype)
 
-            if is_output and arg.arg_class is ImageArg:
+            if is_output and isinstance(arg, ImageArg):
                 raise LoopyError("write-mode images not supported in "
                         "automatic testing")
 
             fill_rand(storage_array)
 
-            if arg.arg_class is ImageArg:
+            if isinstance(arg, ImageArg):
                 # must be contiguous
                 pre_run_ary = pre_run_storage_array = storage_array.copy()
 
@@ -191,20 +210,17 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
 
 # {{{ "full-scale" arguments
 
-def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
+def make_args(kernel, queue, ref_arg_data, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
-    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
-            TemporaryVariable, ConstantArg
+    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, ConstantArg
 
     from pymbolic import evaluate
 
     args = {}
-    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
-        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
-
-        if arg.arg_class is ValueArg:
+    for arg, arg_desc in zip(kernel.args, ref_arg_data):
+        if isinstance(arg, ValueArg):
             arg_value = parameters[arg.name]
 
             try:
@@ -217,24 +233,23 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
 
             args[arg.name] = arg_value
 
-        elif arg.arg_class is ImageArg:
+        elif isinstance(arg, ImageArg):
             if arg.name in kernel.get_written_variables():
                 raise NotImplementedError("write-mode images not supported in "
                         "automatic testing")
 
-            shape = evaluate_shape(arg.unvec_shape, parameters)
+            shape = evaluate_shape(arg.shape, parameters)
             assert shape == arg_desc.ref_shape
 
             # must be contiguous
             args[arg.name] = cl.image_from_array(
                     queue.context, arg_desc.ref_pre_run_array.get())
 
-        elif arg.arg_class is ArrayArg or\
-                arg.arg_class is ConstantArg:
-            shape = evaluate(arg.unvec_shape, parameters)
-            strides = evaluate(arg.unvec_strides, parameters)
+        elif isinstance(arg, (ArrayArg, ConstantArg)):
+            shape = evaluate(arg.shape, parameters)
+            strides = evaluate(get_strides(arg), parameters)
 
-            dtype = kernel_arg.dtype
+            dtype = arg.dtype
             itemsize = dtype.itemsize
             numpy_strides = [itemsize*s for s in strides]
 
@@ -280,10 +295,6 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
             arg_desc.test_numpy_strides = numpy_strides
             arg_desc.test_alloc_size = alloc_size
 
-        elif arg.arg_class is TemporaryVariable:
-            # global temporary, handled by invocation logic
-            pass
-
         else:
             raise LoopyError("arg type not understood")
 
@@ -309,6 +320,7 @@ def _default_check_result(result, ref_result):
                 / np.max(np.abs(ref_result-result)))
         # pylint: disable=bad-string-format-type
         return (False,
+                # pylint: disable=bad-string-format-type
                 "results do not match -- (rel) l_2 err: %g, l_inf err: %g"
                 % (l2_err, linf_err))
     else:
@@ -455,9 +467,6 @@ def auto_test_vs_ref(
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
         ref_codegen_result = lp.generate_code_v2(ref_prog)
 
-        ref_implemented_data_info = ref_codegen_result.implemented_data_infos[
-                ref_entrypoint]
-
         logger.info("{} (ref): trying {} for the reference calculation".format(
             ref_entrypoint, dev))
 
@@ -471,9 +480,7 @@ def auto_test_vs_ref(
 
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_prog[ref_entrypoint],
-                            ref_implemented_data_info,
-                            ref_queue, parameters)
+                    make_ref_args(ref_prog[ref_entrypoint], ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
             if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
@@ -548,8 +555,6 @@ def auto_test_vs_ref(
     test_prog_codegen_result = lp.generate_code_v2(test_prog)
 
     args = make_args(test_prog[test_entrypoint],
-            test_prog_codegen_result.implemented_data_infos[
-                test_entrypoint],
             queue, ref_arg_data, parameters)
     args["out_host"] = False
 
diff --git a/loopy/check.py b/loopy/check.py
index 9d0c04f28c55c9f123a874913c13e56c33497e54..0f3faced863685c389d04ff2f9db3d588684e118 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -20,9 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import Union, Tuple, Optional, List
 
+import numpy as np
 from islpy import dim_type
 import islpy as isl
+
 from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction
 from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning,
         warn_with_kernel, LoopyIndexError)
@@ -30,7 +33,13 @@ from loopy.type_inference import TypeReader
 from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction,
                                       CInstruction, _DataObliviousInstruction,
                                       NoOpInstruction)
+from loopy.kernel import LoopKernel
+from loopy.kernel.array import (
+        FixedStrideArrayDimTag, SeparateArrayArrayDimTag, ArrayBase)
+from loopy.kernel.data import auto, ArrayArg, ArrayDimImplementationTag
 from loopy.translation_unit import for_each_kernel
+from loopy.typing import ExpressionT
+
 from pytools import memoize_method
 
 from collections import defaultdict
@@ -146,6 +155,131 @@ def check_functions_are_resolved(kernel):
         else:
             raise NotImplementedError(type(insn))
 
+
+@for_each_kernel
+def check_separated_array_consistency(kernel: LoopKernel) -> None:
+    # Boo. This is (part of) the price of redundant representation.
+    for arg in kernel.args:
+        if isinstance(arg, ArrayArg) and arg._separation_info is not None:
+            sep_indices = arg._separation_info.sep_axis_indices_set
+            for subarg_name in arg._separation_info.subarray_names.values():
+                sub_arg = kernel.arg_dict[subarg_name]
+
+                from loopy.preprocess import _remove_at_indices
+
+                assert arg.shape is None or isinstance(arg.shape, tuple)
+                if _remove_at_indices(sep_indices, arg.shape) != sub_arg.shape:
+                    raise LoopyError(
+                            f"Shapes of '{arg.name}' and associated sep array "
+                            f"'{sub_arg.name}' are not consistent.")
+
+                assert arg.dim_tags is None or isinstance(arg.dim_tags, tuple)
+                if _remove_at_indices(sep_indices, arg.dim_tags) != sub_arg.dim_tags:
+                    raise LoopyError(
+                            f"Dim tags of '{arg.name}' and associated sep array "
+                            f"'{sub_arg.name}' are not consistent.")
+
+                for attr_name in ["address_space", "is_input", "is_output"]:
+                    if getattr(arg, attr_name) != getattr(sub_arg, attr_name):
+                        raise LoopyError(
+                                f"Attribute '{attr_name}' of "
+                                f"'{arg.name}' and associated sep array "
+                                f"'{sub_arg.name}' is not consistent.")
+
+
+@for_each_kernel
+def check_offsets_and_dim_tags(kernel: LoopKernel) -> None:
+    from loopy.symbolic import DependencyMapper
+    from pymbolic.primitives import Variable, Expression
+
+    arg_name_vars = {Variable(name) for name in kernel.arg_dict}
+    dep_mapper = DependencyMapper()
+
+    def ensure_depends_only_on_arguments(
+            what: str, expr: Union[str, ExpressionT]) -> None:
+        if isinstance(expr, str):
+            expr = Variable(expr)
+
+        deps = dep_mapper(expr)
+        if not deps <= arg_name_vars:
+            raise LoopyError(
+                    f"not all names in {what} are arguments: "
+                    + ", ".join(str(v) for v in deps - arg_name_vars))
+
+    # {{{ process arguments
+
+    new_args = []
+    for arg in kernel.args:
+        if isinstance(arg, ArrayArg):
+            what = f"offset of argument '{arg.name}'"
+            if arg.offset is None:
+                continue
+            if arg.offset is auto:
+                pass
+            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+                ensure_depends_only_on_arguments(what, arg.offset)
+
+            else:
+                raise LoopyError(f"invalid value of offset for '{arg.name}'")
+
+            if arg.dim_tags is None:
+                new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = \
+                        arg.dim_tags
+            else:
+                new_dim_tags = ()
+                for iaxis, dim_tag in enumerate(arg.dim_tags):
+                    if isinstance(dim_tag, FixedStrideArrayDimTag):
+                        what = (f"stride for axis {iaxis+1} (1-based) "
+                                f"of argument '{arg.name}'")
+                        if dim_tag.stride is auto:
+                            pass
+                        elif isinstance(
+                                dim_tag.stride, (int, np.integer, Expression)):
+                            ensure_depends_only_on_arguments(what, dim_tag.stride)
+                        else:
+                            raise LoopyError(f"invalid value of {what}")
+
+                    assert new_dim_tags is not None
+                    new_dim_tags = new_dim_tags + (dim_tag,)
+
+            arg = arg.copy(dim_tags=new_dim_tags)
+
+        new_args.append(arg)
+
+    # }}}
+
+    # {{{ process temporary variables
+
+    for tv in kernel.temporary_variables.values():
+        what = f"offset of temporary '{tv.name}'"
+        if tv.offset is None:
+            pass
+        elif tv.offset is auto:
+            pass
+        elif isinstance(tv.offset, (int, np.integer, Expression, str)):
+            ensure_depends_only_on_arguments(what, tv.offset)
+        else:
+            raise LoopyError(f"invalid value of offset for '{tv.name}'")
+
+        if tv.dim_tags is not None:
+            for iaxis, dim_tag in enumerate(tv.dim_tags):
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    what = ("axis stride for axis "
+                            f"{iaxis+1} (1-based) of temporary '{tv.name}'")
+                    if dim_tag.stride is auto:
+                        raise LoopyError(f"The {what} is 'auto', "
+                                "which is not allowed.")
+                    elif isinstance(dim_tag.stride, (int, np.integer, Expression)):
+                        ensure_depends_only_on_arguments(what, dim_tag.stride)
+                    else:
+                        raise LoopyError(f"invalid value of {what}")
+
+                elif isinstance(dim_tag, SeparateArrayArrayDimTag):
+                    raise LoopyError(f"Axis {iaxis+1} of temporary "
+                            f"'{tv.name}' is tagged 'sep'. This is not allowed.")
+
+    # }}}
+
 # }}}
 
 
@@ -465,24 +599,6 @@ def check_for_write_races(kernel):
                         WriteRaceConditionWarning)
 
 
-@for_each_kernel
-def check_for_orphaned_user_hardware_axes(kernel):
-    from loopy.kernel.data import LocalInameTag
-    for axis in kernel.local_sizes:
-        found = False
-        for iname in kernel.inames.values():
-            for tag in iname.tags:
-                if isinstance(tag, LocalInameTag) and tag.axis == axis:
-                    found = True
-                    break
-            if found:
-                break
-
-        if not found:
-            raise LoopyError("user-requested local hardware axis %d "
-                    "has no iname mapped to it" % axis)
-
-
 @for_each_kernel
 def check_for_data_dependent_parallel_bounds(kernel):
     """
@@ -1118,12 +1234,13 @@ def pre_schedule_checks(t_unit):
         check_for_integer_subscript_indices(t_unit)
 
         check_functions_are_resolved(t_unit)
+        check_separated_array_consistency(t_unit)
+        check_offsets_and_dim_tags(t_unit)
         # Ordering restriction:
         # check_sub_array_ref_inames_not_within_or_redn_inames should be done
         # before check_bounds. See: BatchedAccessMapMapper.map_sub_array_ref.
         check_sub_array_ref_inames_not_within_or_redn_inames(t_unit)
         check_for_duplicate_insn_ids(t_unit)
-        check_for_orphaned_user_hardware_axes(t_unit)
         check_for_double_use_of_hw_axes(t_unit)
         check_insn_attributes(t_unit)
         check_loop_priority_inames_known(t_unit)
@@ -1151,6 +1268,37 @@ def pre_schedule_checks(t_unit):
 
 # {{{ post-schedule / pre-code-generation checks
 
+# {{{ check_for_nested_base_storage
+
+def check_for_nested_base_storage(kernel: LoopKernel) -> None:
+    # must run after preprocessing has created variables for base_storage
+
+    from loopy.kernel.data import ArrayArg
+    arrays: List[ArrayBase] = [
+        arg for arg in kernel.args if isinstance(arg, ArrayArg)
+        ]
+    arrays = arrays + list(kernel.temporary_variables.values())
+
+    name_to_array = {ary.name: ary for ary in arrays}
+
+    for ary in kernel.temporary_variables.values():
+        if ary.base_storage:
+            storage_array = name_to_array.get(ary.base_storage, None)
+
+            if storage_array is None:
+                raise ValueError("nothing known about storage array "
+                        f"'{ary.base_storage}' serving as base_storage of "
+                        f"'{ary.name}'")
+
+            if storage_array.base_storage:
+                raise ValueError("storage array "
+                        f"'{storage_array.name}' serving as base_storage of "
+                        f"'{ary.name}' may not itself use base_storage "
+                        f"(currently given as '{storage_array.base_storage}')")
+
+# }}}
+
+
 # {{{ check for unused hw axes
 
 def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
@@ -1540,8 +1688,6 @@ def _validate_kernel_call_sites_inner(kernel, callables):
 
 
 def validate_kernel_call_sites(translation_unit):
-    from loopy import LoopKernel
-
     for name in translation_unit.callables_table:
         clbl = translation_unit[name]
         if isinstance(clbl, LoopKernel):
@@ -1562,6 +1708,7 @@ def pre_codegen_entrypoint_checks(kernel, callables_table):
 def pre_codegen_callable_checks(kernel, callables_table):
     logger.debug("pre-codegen callable check %s: start" % kernel.name)
 
+    check_for_nested_base_storage(kernel)
     check_for_unused_hw_axes_in_insns(kernel, callables_table)
     check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
     check_that_temporaries_are_defined_in_subkernels_where_used(kernel)
diff --git a/loopy/cli.py b/loopy/cli.py
index d7e6d148c8de44bbdefbb40b32190faf42e4e34a..426fb2e3e2a04b7be33778ffd2e9a0776cbb827d 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -77,7 +77,7 @@ def main():
         target = ISPCTarget
     elif args.target == "ispc-occa":
         from loopy.target.ispc import ISPCTarget
-        target = lambda: ISPCTarget(occa_mode=True)  # noqa: E731
+        target = lambda: ISPCTarget()  # noqa: E731
     elif args.target == "c":
         from loopy.target.c import CTarget
         target = CTarget
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index edf59d62a17b4f1ddf8c92be927f8c1f367cb9d4..a700b153ab4b62ce9ea66cecd6ae2de70f4b5664 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -20,17 +20,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import sys
+from immutables import Map
+from typing import (Set, Mapping, Sequence, Any, FrozenSet, Union,
+       Optional, Tuple, TYPE_CHECKING)
+from dataclasses import dataclass, replace
 import logging
 logger = logging.getLogger(__name__)
 
 import islpy as isl
 
 from loopy.diagnostic import LoopyError, warn
-from pytools import ImmutableRecord
+from pytools import UniqueNameGenerator
 
 from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
+from loopy.types import LoopyType
+from loopy.typing import ExpressionT
+from loopy.kernel import LoopKernel
+from loopy.target import TargetBase
+from loopy.kernel.function_interface import InKernelCallable
 
 
 from loopy.symbolic import CombineMapper
@@ -40,11 +50,19 @@ from loopy.kernel.function_interface import CallableKernel
 
 from pytools import ProcessLogger
 
+if TYPE_CHECKING:
+    from loopy.codegen.tools import CodegenOperationCacheManager
+    from loopy.codegen.result import GeneratedProgram
+
+
+if getattr(sys, "_BUILDING_SPHINX_DOCS", False):
+    from loopy.codegen.tools import CodegenOperationCacheManager  # noqa: F811
+    from loopy.codegen.result import GeneratedProgram  # noqa: F811
+
+
 __doc__ = """
 .. currentmodule:: loopy.codegen
 
-.. autoclass:: ImplementedDataInfo
-
 .. autoclass:: PreambleInfo
 
 .. autoclass:: VectorizationInfo
@@ -61,84 +79,13 @@ __doc__ = """
 """
 
 
-# {{{ implemented data info
-
-class ImplementedDataInfo(ImmutableRecord):
-    """
-    .. attribute:: name
-
-        The expanded name of the array. Note that, for example
-        in the case of separate-array-tagged axes, multiple
-        implemented arrays may correspond to one user-facing
-        array.
-
-    .. attribute:: dtype
-
-    .. attribute:: arg_class
-
-    .. attribute:: base_name
-
-        The user-facing name of the underlying array.
-        May be *None* for non-array arguments.
-
-    .. attribute:: shape
-    .. attribute:: strides
-
-        Strides in multiples of ``dtype.itemsize``.
-
-    .. attribute:: unvec_shape
-    .. attribute:: unvec_strides
-
-        Strides in multiples of ``dtype.itemsize`` that accounts for
-        :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar
-        manner
-
-
-    .. attribute:: offset_for_name
-    .. attribute:: stride_for_name_and_axis
-
-        A tuple *(name, axis)* indicating the (implementation-facing)
-        name of the array and axis number for which this argument provides
-        the strides.
-
-    .. attribute:: allows_offset
-    .. attribute:: is_written
-    """
-
-    def __init__(self, target, name, dtype, arg_class,
-            base_name=None,
-            shape=None, strides=None,
-            unvec_shape=None, unvec_strides=None,
-            offset_for_name=None, stride_for_name_and_axis=None,
-            allows_offset=None,
-            is_written=None):
-
-        from loopy.types import LoopyType
-        assert isinstance(dtype, LoopyType)
-
-        ImmutableRecord.__init__(self,
-                name=name,
-                dtype=dtype,
-                arg_class=arg_class,
-                base_name=base_name,
-                shape=shape,
-                strides=strides,
-                unvec_shape=unvec_shape,
-                unvec_strides=unvec_strides,
-                offset_for_name=offset_for_name,
-                stride_for_name_and_axis=stride_for_name_and_axis,
-                allows_offset=allows_offset,
-                is_written=is_written)
-
-# }}}
-
-
 # {{{ code generation state
 
 class UnvectorizableError(Exception):
     pass
 
 
+@dataclass(frozen=True)
 class VectorizationInfo:
     """
     .. attribute:: iname
@@ -146,13 +93,14 @@ class VectorizationInfo:
     .. attribute:: space
     """
 
-    def __init__(self, iname, length, space):
-        self.iname = iname
-        self.length = length
-        self.space = space
+    iname: str
+    length: int
+    # FIXME why is this here?
+    space: isl.Space
 
 
-class SeenFunction(ImmutableRecord):
+@dataclass(frozen=True)
+class SeenFunction:
     """This is used to track functions that emerge late during code generation,
     e.g. C functions to realize arithmetic. No connection with
     :class:`~loopy.kernel.function_interface.InKernelCallable`.
@@ -167,23 +115,17 @@ class SeenFunction(ImmutableRecord):
 
         a tuple of result dtypes
     """
-
-    def __init__(self, name, c_name, arg_dtypes, result_dtypes):
-        ImmutableRecord.__init__(self,
-                name=name,
-                c_name=c_name,
-                arg_dtypes=arg_dtypes,
-                result_dtypes=result_dtypes)
+    name: str
+    c_name: str
+    arg_dtypes: Tuple[LoopyType, ...]
+    result_dtypes: Tuple[LoopyType, ...]
 
 
+@dataclass(frozen=True)
 class CodeGenerationState:
     """
     .. attribute:: kernel
     .. attribute:: target
-    .. attribute:: implemented_data_info
-
-        a list of :class:`ImplementedDataInfo` objects.
-
     .. attribute:: implemented_domain
 
         The entire implemented domain (as an :class:`islpy.Set`)
@@ -210,7 +152,8 @@ class CodeGenerationState:
 
     .. attribute:: vectorization_info
 
-        None or an instance of :class:`VectorizationInfo`
+        *None* (to mean vectorization has not yet been applied), or an instance of
+        :class:`VectorizationInfo`.
 
     .. attribute:: is_generating_device_code
 
@@ -237,105 +180,48 @@ class CodeGenerationState:
         An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`.
     """
 
-    def __init__(self, kernel, target,
-            implemented_data_info, implemented_domain, implemented_predicates,
-            seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
-            allow_complex,
-            callables_table,
-            is_entrypoint,
-            vectorization_info=None, var_name_generator=None,
-            is_generating_device_code=None,
-            gen_program_name=None,
-            schedule_index_end=None,
-            codegen_cachemanager=None):
-        self.kernel = kernel
-        self.target = target
-        self.implemented_data_info = implemented_data_info
-        self.implemented_domain = implemented_domain
-        self.implemented_predicates = implemented_predicates
-        self.seen_dtypes = seen_dtypes
-        self.seen_functions = seen_functions
-        self.seen_atomic_dtypes = seen_atomic_dtypes
-        self.var_subst_map = var_subst_map.copy()
-        self.allow_complex = allow_complex
-        self.callables_table = callables_table
-        self.is_entrypoint = is_entrypoint
-        self.vectorization_info = vectorization_info
-        self.var_name_generator = var_name_generator
-        self.is_generating_device_code = is_generating_device_code
-        self.gen_program_name = gen_program_name
-        self.schedule_index_end = schedule_index_end
-        self.codegen_cachemanager = codegen_cachemanager
+    kernel: LoopKernel
+    target: TargetBase
+    implemented_domain: isl.Set
+    implemented_predicates: FrozenSet[Union[str, ExpressionT]]
+
+    # /!\ mutable
+    seen_dtypes: Set[LoopyType]
+    seen_functions: Set[SeenFunction]
+    seen_atomic_dtypes: Set[LoopyType]
+
+    var_subst_map: Map[str, ExpressionT]
+    allow_complex: bool
+    callables_table: Mapping[str, InKernelCallable]
+    is_entrypoint: bool
+    var_name_generator: UniqueNameGenerator
+    is_generating_device_code: bool
+    gen_program_name: str
+    schedule_index_end: int
+    codegen_cachemanager: "CodegenOperationCacheManager"
+    vectorization_info: Optional[VectorizationInfo] = None
+
+    def __post_init__(self):
+        # FIXME: If this doesn't bomb during testing, we can get rid of target.
+        assert self.target == self.kernel.target
+
+        assert self.vectorization_info is None or isinstance(
+                self.vectorization_info, VectorizationInfo)
 
     # {{{ copy helpers
 
-    def copy(self, kernel=None, target=None, implemented_data_info=None,
-            implemented_domain=None, implemented_predicates=frozenset(),
-            var_subst_map=None, is_entrypoint=None, vectorization_info=None,
-            is_generating_device_code=None, gen_program_name=None,
-            schedule_index_end=None):
-
-        if kernel is None:
-            kernel = self.kernel
-
-        if target is None:
-            target = self.target
-
-        if implemented_data_info is None:
-            implemented_data_info = self.implemented_data_info
-
-        if is_entrypoint is None:
-            is_entrypoint = self.is_entrypoint
-
-        if vectorization_info is False:
-            vectorization_info = None
-
-        elif vectorization_info is None:
-            vectorization_info = self.vectorization_info
-
-        if is_generating_device_code is None:
-            is_generating_device_code = self.is_generating_device_code
-
-        if gen_program_name is None:
-            gen_program_name = self.gen_program_name
-
-        if schedule_index_end is None:
-            schedule_index_end = self.schedule_index_end
-
-        return CodeGenerationState(
-                kernel=kernel,
-                target=target,
-                implemented_data_info=implemented_data_info,
-                implemented_domain=implemented_domain or self.implemented_domain,
-                implemented_predicates=(
-                    implemented_predicates or self.implemented_predicates),
-                seen_dtypes=self.seen_dtypes,
-                seen_functions=self.seen_functions,
-                seen_atomic_dtypes=self.seen_atomic_dtypes,
-                var_subst_map=var_subst_map or self.var_subst_map,
-                allow_complex=self.allow_complex,
-                callables_table=self.callables_table,
-                is_entrypoint=is_entrypoint,
-                vectorization_info=vectorization_info,
-                var_name_generator=self.var_name_generator,
-                is_generating_device_code=is_generating_device_code,
-                gen_program_name=gen_program_name,
-                schedule_index_end=schedule_index_end,
-                codegen_cachemanager=self.codegen_cachemanager.with_kernel(kernel),
-                )
+    def copy(self, **kwargs: Any) -> "CodeGenerationState":
+        return replace(self, **kwargs)
 
-    def copy_and_assign(self, name, value):
+    def copy_and_assign(
+            self, name: str, value: ExpressionT) -> "CodeGenerationState":
         """Make a copy of self with variable *name* fixed to *value*."""
-        var_subst_map = self.var_subst_map.copy()
-        var_subst_map[name] = value
-        return self.copy(var_subst_map=var_subst_map)
+        return self.copy(var_subst_map=self.var_subst_map.set(name, value))
 
-    def copy_and_assign_many(self, assignments):
+    def copy_and_assign_many(self, assignments) -> "CodeGenerationState":
         """Make a copy of self with *assignments* included."""
 
-        var_subst_map = self.var_subst_map.copy()
-        var_subst_map.update(assignments)
-        return self.copy(var_subst_map=var_subst_map)
+        return self.copy(var_subst_map=self.var_subst_map.update(assignments))
 
     # }}}
 
@@ -396,8 +282,10 @@ class CodeGenerationState:
 
     def unvectorize(self, func):
         vinf = self.vectorization_info
+        assert vinf is not None
+
         result = []
-        novec_self = self.copy(vectorization_info=False)
+        novec_self = self.copy(vectorization_info=None)
 
         for i in range(vinf.length):
             idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
@@ -453,14 +341,15 @@ class InKernelCallablesCollector(CombineMapper):
     map_type_cast = map_constant
 
 
-class PreambleInfo(ImmutableRecord):
-    """
-    .. attribute:: kernel
-    .. attribute:: seen_dtypes
-    .. attribute:: seen_functions
-    .. attribute:: seen_atomic_dtypes
-    .. attribute:: codegen_state
-    """
+@dataclass(frozen=True)
+class PreambleInfo:
+    kernel: LoopKernel
+    seen_dtypes: Set[LoopyType]
+    seen_functions: Set[SeenFunction]
+    seen_atomic_dtypes: Set[LoopyType]
+
+    # FIXME: This makes all the above redundant. It probably shouldn't be here.
+    codegen_state: CodeGenerationState
 
 
 # {{{ main code generation entrypoint
@@ -480,47 +369,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
 
     codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code")
 
-    # {{{ pre-codegen-process of non-entrypoint kernel
-
-    if not is_entrypoint:
-        from loopy.kernel.array import ArrayBase
-        from loopy.kernel.data import auto
-
-        new_args = [arg.copy(offset=0 if arg.offset is auto else arg.offset)
-                    if isinstance(arg, ArrayBase)
-                    else arg
-                    for arg in kernel.args]
-        kernel = kernel.copy(args=new_args)
-
-    # }}}
-
     # {{{ examine arg list
 
-    from loopy.kernel.data import ValueArg
-    from loopy.kernel.array import ArrayBase
-
-    implemented_data_info = []
-
-    for arg in kernel.args:
-        is_written = arg.name in kernel.get_written_variables()
-        if isinstance(arg, ArrayBase):
-            implemented_data_info.extend(
-                    arg.decl_info(
-                        target,
-                        is_written=is_written,
-                        index_dtype=kernel.index_dtype))
-
-        elif isinstance(arg, ValueArg):
-            implemented_data_info.append(ImplementedDataInfo(
-                target=target,
-                name=arg.name,
-                dtype=arg.dtype,
-                arg_class=ValueArg,
-                is_written=is_written))
-
-        else:
-            raise ValueError("argument type not understood: '%s'" % type(arg))
-
     allow_complex = False
     for var in kernel.args + list(kernel.temporary_variables.values()):
         if var.dtype.involves_complex():
@@ -539,13 +389,12 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
     codegen_state = CodeGenerationState(
             kernel=kernel,
             target=target,
-            implemented_data_info=implemented_data_info,
             implemented_domain=initial_implemented_domain,
             implemented_predicates=frozenset(),
             seen_dtypes=seen_dtypes,
             seen_functions=seen_functions,
             seen_atomic_dtypes=seen_atomic_dtypes,
-            var_subst_map={},
+            var_subst_map=Map(),
             allow_complex=allow_complex,
             var_name_generator=kernel.get_var_name_generator(),
             is_generating_device_code=False,
@@ -573,17 +422,16 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
 
     # {{{ handle preambles
 
-    for idi in codegen_state.implemented_data_info:
-        seen_dtypes.add(idi.dtype)
+    for arg in kernel.args:
+        seen_dtypes.add(arg.dtype)
 
     for tv in kernel.temporary_variables.values():
-        for idi in tv.decl_info(kernel.target, index_dtype=kernel.index_dtype):
-            seen_dtypes.add(idi.dtype)
+        seen_dtypes.add(tv.dtype)
 
     if kernel.all_inames():
         seen_dtypes.add(kernel.index_dtype)
 
-    preambles = kernel.preambles[:]
+    preambles = list(kernel.preambles)
 
     preamble_info = PreambleInfo(
             kernel=kernel,
@@ -594,8 +442,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
             codegen_state=codegen_state
             )
 
-    preamble_generators = (kernel.preamble_generators
-            + target.get_device_ast_builder().preamble_generators())
+    preamble_generators = (list(kernel.preamble_generators)
+            + list(target.get_device_ast_builder().preamble_generators()))
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
@@ -648,7 +496,8 @@ def diverge_callee_entrypoints(program):
     return program.copy(callables_table=new_callables)
 
 
-class TranslationUnitCodeGenerationResult(ImmutableRecord):
+@dataclass(frozen=True)
+class TranslationUnitCodeGenerationResult:
     """
     .. attribute:: host_program
 
@@ -663,16 +512,16 @@ class TranslationUnitCodeGenerationResult(ImmutableRecord):
     .. attribute:: host_preambles
     .. attribute:: device_preambles
 
-    .. attribute:: implemented_data_infos
-
-        A mapping from names of entrypoints to their
-        list of :class:`ImplementedDataInfo` objects.
-
     .. automethod:: host_code
     .. automethod:: device_code
     .. automethod:: all_code
 
     """
+    host_programs: Mapping[str, "GeneratedProgram"]
+    device_programs: Sequence["GeneratedProgram"]
+    host_preambles: Sequence[Tuple[int, str]] = ()
+    device_preambles: Sequence[Tuple[int, str]] = ()
+
     def host_code(self):
         from loopy.codegen.result import process_preambles
         preamble_codes = process_preambles(getattr(self, "host_preambles", []))
@@ -695,9 +544,9 @@ class TranslationUnitCodeGenerationResult(ImmutableRecord):
     def all_code(self):
         from loopy.codegen.result import process_preambles
         preamble_codes = process_preambles(
-                getattr(self, "host_preambles", [])
+                tuple(getattr(self, "host_preambles", ()))
                 +
-                getattr(self, "device_preambles", [])
+                tuple(getattr(self, "device_preambles", ()))
                 )
 
         return (
@@ -768,7 +617,6 @@ def generate_code_v2(program):
     device_programs = []
     device_preambles = []
     callee_fdecls = []
-    implemented_data_infos = {}
 
     # {{{ collect host/device programs
 
@@ -780,7 +628,6 @@ def generate_code_v2(program):
                                                 func_id in program.entrypoints)
         if func_id in program.entrypoints:
             host_programs[func_id] = cgr.host_program
-            implemented_data_infos[func_id] = cgr.implemented_data_info
         else:
             assert len(cgr.device_programs) == 1
             callee_fdecls.append(cgr.device_programs[0].ast.fdecl)
@@ -805,8 +652,7 @@ def generate_code_v2(program):
     cgr = TranslationUnitCodeGenerationResult(
             host_programs=host_programs,
             device_programs=device_programs,
-            device_preambles=device_preambles,
-            implemented_data_infos=implemented_data_infos)
+            device_preambles=device_preambles)
 
     if CACHING_ENABLED:
         code_gen_cache.store_if_not_present(input_program, cgr)
@@ -820,6 +666,11 @@ def generate_code(kernel, device=None):
         warn("passing 'device' to generate_code() is deprecated",
                 DeprecationWarning, stacklevel=2)
 
+    from warnings import warn
+    warn("generate_code is deprecated and will stop working in 2023. "
+            "Call generate_code_v2 instead.",
+            DeprecationWarning, stacklevel=2)
+
     codegen_result = generate_code_v2(kernel)
 
     if len(codegen_result.device_programs) > 1:
@@ -829,10 +680,7 @@ def generate_code(kernel, device=None):
         raise LoopyError("kernel passed to generate_code yielded multiple "
                 "host programs. Use generate_code_v2.")
 
-    assert len(codegen_result.implemented_data_infos) == 1
-    implemented_data_info, = codegen_result.implemented_data_infos.values()
-
-    return codegen_result.device_code(), implemented_data_info
+    return codegen_result.device_code(), None
 
 # }}}
 
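The switch of var_subst_map from a plain dict to immutables.Map is what lets
copy_and_assign and copy_and_assign_many above avoid copying: Map.set and
Map.update return new maps and leave the original untouched. A standalone sketch
of that behavior (not part of this diff):

    from immutables import Map

    m1 = Map({"n": 16})
    m2 = m1.set("i", 0)               # returns a new Map; m1 is unchanged
    m3 = m2.update({"j": 1, "k": 2})  # likewise, with several entries added

    assert "i" not in m1
    assert m2["i"] == 0
    assert m3["j"] == 1 and m3["k"] == 2
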
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 8d98196f2d74955867c44ac6d2e66ab259a07bb7..9f597f8478c1b0524112c8bd261a17e132e81fc3 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -23,47 +23,15 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from loopy.codegen.result import merge_codegen_results, wrap_in_if
 import islpy as isl
+
+from loopy.codegen.result import merge_codegen_results, wrap_in_if
 from loopy.schedule import (
         EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel,
         gather_schedule_block, generate_sub_sched_items)
 from loopy.diagnostic import LoopyError
 
 
-def synthesize_idis_for_extra_args(kernel, schedule_index):
-    """
-    :returns: A list of :class:`loopy.codegen.ImplementedDataInfo`
-    """
-    sched_item = kernel.linearization[schedule_index]
-
-    from loopy.codegen import ImplementedDataInfo
-    from loopy.kernel.data import InameArg, AddressSpace
-
-    assert isinstance(sched_item, CallKernel)
-
-    idis = []
-
-    for arg in sched_item.extra_args:
-        temporary = kernel.temporary_variables[arg]
-        assert temporary.address_space == AddressSpace.GLOBAL
-        idis.extend(
-            temporary.decl_info(
-                kernel.target,
-                index_dtype=kernel.index_dtype))
-
-    for iname in sched_item.extra_inames:
-        idis.append(
-            ImplementedDataInfo(
-                target=kernel.target,
-                name=iname,
-                dtype=kernel.index_dtype,
-                arg_class=InameArg,
-                is_written=False))
-
-    return idis
-
-
 def generate_code_for_sched_index(codegen_state, sched_index):
     kernel = codegen_state.kernel
     sched_item = kernel.linearization[sched_index]
@@ -75,14 +43,11 @@ def generate_code_for_sched_index(codegen_state, sched_index):
         _, past_end_i = gather_schedule_block(kernel.linearization, sched_index)
         assert past_end_i <= codegen_state.schedule_index_end
 
-        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)
-
         new_codegen_state = codegen_state.copy(
                 is_generating_device_code=True,
                 gen_program_name=sched_item.kernel_name,
                 schedule_index_end=past_end_i-1,
-                implemented_data_info=(codegen_state.implemented_data_info
-                    + extra_args))
+                )
 
         from loopy.codegen.result import generate_host_or_device_program
         codegen_result = generate_host_or_device_program(
@@ -98,8 +63,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
                 codegen_state.ast_builder.get_kernel_call(
                     codegen_state,
                     sched_item.kernel_name,
-                    glob_grid, loc_grid,
-                    extra_args),
+                    glob_grid, loc_grid)
                 ])
         else:
             # do not generate host code for non-entrypoint kernels
@@ -157,8 +121,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
                 return CodeGenerationResult(
                         host_program=None,
                         device_programs=[],
-                        implemented_domains={},
-                        implemented_data_info=codegen_state.implemented_data_info)
+                        implemented_domains={})
 
             else:
                 raise LoopyError("do not know how to emit code for barrier "
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 7523c11d7e4ca9764abbe9cec4a917ade8128974..e19686929d787a06cb562ebfc975ffe509e482c6 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -20,10 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from pytools import ImmutableRecord
+from typing import Any, Sequence, Mapping, Tuple, Optional
+from dataclasses import dataclass, replace
 
+import islpy as isl
 
-def process_preambles(preambles):
+
+def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]:
     seen_preamble_tags = set()
     dedup_preambles = []
 
@@ -55,7 +58,8 @@ __doc__ = """
 
 # {{{ code generation result
 
-class GeneratedProgram(ImmutableRecord):
+@dataclass(frozen=True)
+class GeneratedProgram:
     """
     .. attribute:: name
 
@@ -73,8 +77,17 @@ class GeneratedProgram(ImmutableRecord):
         the overall function definition.
     """
 
+    name: str
+    is_device_program: bool
+    ast: Any
+    body_ast: Optional[Any] = None
+
+    def copy(self, **kwargs: Any) -> "GeneratedProgram":
+        return replace(self, **kwargs)
 
-class CodeGenerationResult(ImmutableRecord):
+
+@dataclass(frozen=True)
+class CodeGenerationResult:
     """
     .. attribute:: host_program
     .. attribute:: device_programs
@@ -93,12 +106,15 @@ class CodeGenerationResult(ImmutableRecord):
     .. automethod:: host_code
     .. automethod:: device_code
     .. automethod:: all_code
-
-    .. attribute:: implemented_data_info
-
-        a list of :class:`loopy.codegen.ImplementedDataInfo` objects.
-        Only added at the very end of code generation.
     """
+    host_program: Optional[GeneratedProgram]
+    device_programs: Sequence[GeneratedProgram]
+    implemented_domains: Mapping[str, isl.Set]
+    host_preambles: Sequence[Tuple[int, str]] = ()
+    device_preambles: Sequence[Tuple[int, str]] = ()
+
+    def copy(self, **kwargs: Any) -> "CodeGenerationResult":
+        return replace(self, **kwargs)
 
     @staticmethod
     def new(codegen_state, insn_id, ast, implemented_domain):
@@ -119,12 +135,12 @@ class CodeGenerationResult(ImmutableRecord):
                     }
 
         return CodeGenerationResult(
-                implemented_data_info=codegen_state.implemented_data_info,
                 implemented_domains={insn_id: [implemented_domain]},
                 **kwargs)
 
     def host_code(self):
-        preamble_codes = process_preambles(getattr(self, "host_preambles", []))
+        assert self.host_program is not None
+        preamble_codes = process_preambles(self.host_preambles)
 
         return (
                 "".join(preamble_codes)
@@ -132,7 +148,7 @@ class CodeGenerationResult(ImmutableRecord):
                 str(self.host_program.ast))
 
     def device_code(self):
-        preamble_codes = process_preambles(getattr(self, "device_preambles", []))
+        preamble_codes = process_preambles(self.device_preambles)
 
         return (
                 "".join(preamble_codes)
@@ -140,6 +156,7 @@ class CodeGenerationResult(ImmutableRecord):
                 + "\n\n".join(str(dp.ast) for dp in self.device_programs))
 
     def all_code(self):
+        assert self.host_program is not None
         preamble_codes = process_preambles(
                 getattr(self, "host_preambles", [])
                 +
@@ -178,7 +195,7 @@ class CodeGenerationResult(ImmutableRecord):
             assert program.is_device_program
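+            # device_programs may be stored as a tuple; build a list so that
+            # the concatenation with [program] below works.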
             return self.copy(
                     device_programs=(
-                        self.device_programs[:-1]
+                        list(self.device_programs[:-1])
                         +
                         [program]))
         else:
@@ -207,8 +224,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
         return CodeGenerationResult(
                 host_program=None,
                 device_programs=[],
-                implemented_domains={},
-                implemented_data_info=codegen_state.implemented_data_info)
+                implemented_domains={})
 
     ast_els = []
     new_device_programs = []
@@ -260,7 +276,6 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
             .with_new_ast(codegen_state, ast)
             .copy(
                 implemented_domains=implemented_domains,
-                implemented_data_info=codegen_state.implemented_data_info,
                 **kwargs))
 
 
@@ -330,3 +345,5 @@ def generate_host_or_device_program(codegen_state, schedule_index):
     return codegen_result
 
 # }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/codegen/tools.py b/loopy/codegen/tools.py
index ba8e1cfc9270ad5a5b14f6b13eb5fa5fa907cb39..d9206ed4d93b5095c315433ba63f077245068349 100644
--- a/loopy/codegen/tools.py
+++ b/loopy/codegen/tools.py
@@ -20,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from functools import cached_property
 from pytools import memoize_method
+
 from loopy.schedule import (EnterLoop, LeaveLoop, CallKernel, ReturnFromKernel,
                             Barrier, BeginBlockItem, gather_schedule_block,
                             ScheduleItem)
@@ -40,7 +42,7 @@ __doc__ = """
 """
 
 
-@dataclass
+@dataclass(frozen=True)
 class KernelProxyForCodegenOperationCacheManager:
     """
     Proxy to :class:`loopy.LoopKernel` to be used by
@@ -50,8 +52,7 @@ class KernelProxyForCodegenOperationCacheManager:
     linearization: List[ScheduleItem]
     inames: Dict[str, Iname]
 
-    @property
-    @memoize_method
+    @cached_property
     def id_to_insn(self):
         return {insn.id: insn for insn in self.instructions}
 
@@ -111,8 +112,7 @@ class CodegenOperationCacheManager:
 
         return self
 
-    @property
-    @memoize_method
+    @cached_property
     def active_inames(self):
         """
         Returns an instance of :class:`list`, with the i-th entry being a
@@ -139,8 +139,7 @@ class CodegenOperationCacheManager:
 
         return active_inames
 
-    @property
-    @memoize_method
+    @cached_property
     def callkernel_index(self):
         """
         Returns an instance of :class:`list`, with the i-th entry being the index of
@@ -165,8 +164,7 @@ class CodegenOperationCacheManager:
 
         return callkernel_index
 
-    @property
-    @memoize_method
+    @cached_property
     def has_barrier_within(self):
         """
         Returns an instance of :class:`list`. The list's i-th entry is *True* if the
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index a2e631587361e76e258a39f95f00cba5d99e6f3a..02a82ab8a74d78b1342fc09b92c19c663d7f2209 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -22,78 +22,47 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from functools import cached_property
+from enum import IntEnum
 from sys import intern
+from typing import (
+        Dict, Sequence, Tuple, Mapping, Optional, FrozenSet, Any, Union,
+        Callable, Iterator, List, Set, TYPE_CHECKING)
+from dataclasses import dataclass, replace, field, fields
+from warnings import warn
 
 from collections import defaultdict
 
 import numpy as np
-from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method
-from pytools.tag import Taggable
+from pytools import (memoize_method,
+        UniqueNameGenerator, generate_unique_names, natsorted)
+from pytools.tag import Taggable, Tag
 import islpy as isl
 from islpy import dim_type
-import re
-
-from pytools import UniqueNameGenerator, generate_unique_names, natsorted
+from immutables import Map
 
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 from loopy.tools import update_persistent_hash
 from loopy.diagnostic import StaticValueFindingError
-from loopy.kernel.data import filter_iname_tags_by_type, Iname
-from warnings import warn
-
-
-# {{{ unique var names
-
-class _UniqueVarNameGenerator(UniqueNameGenerator):
-
-    def __init__(self, existing_names=frozenset(), forced_prefix=""):
-        super().__init__(existing_names, forced_prefix)
-        array_prefix_pattern = re.compile("(.*)_s[0-9]+$")
-
-        array_prefixes = set()
-        for name in existing_names:
-            match = array_prefix_pattern.match(name)
-            if match is None:
-                continue
-
-            array_prefixes.add(match.group(1))
-
-        self.conflicting_array_prefixes = array_prefixes
-        self.array_prefix_pattern = array_prefix_pattern
-
-    def _name_added(self, name):
-        match = self.array_prefix_pattern.match(name)
-        if match is None:
-            return
-
-        self.conflicting_array_prefixes.add(match.group(1))
-
-    def is_name_conflicting(self, name):
-        if name in self.existing_names:
-            return True
-
-        # Array dimensions implemented as separate arrays generate
-        # names by appending '_s<NUMBER>'. Make sure that no
-        # conflicts can arise from these names.
-
-        # Case 1: a_s0 is already a name; we are trying to insert a
-        # Case 2: a is already a name; we are trying to insert a_s0
-
-        if name in self.conflicting_array_prefixes:
-            return True
-
-        match = self.array_prefix_pattern.match(name)
-        if match is None:
-            return False
-
-        return match.group(1) in self.existing_names
-
-# }}}
-
+from loopy.kernel.data import (
+        _ArraySeparationInfo,
+        KernelArgument,
+        filter_iname_tags_by_type, Iname,
+        TemporaryVariable, ValueArg, ArrayArg, SubstitutionRule)
+from loopy.kernel.instruction import InstructionBase
+from loopy.types import LoopyType, NumpyType
+from loopy.options import Options
+from loopy.schedule import ScheduleItem
+from loopy.typing import ExpressionT
+from loopy.target import TargetBase
+
+if TYPE_CHECKING:
+    from loopy.kernel.function_interface import InKernelCallable
+    from loopy.codegen import PreambleInfo
 
 # {{{ loop kernel object
 
-class KernelState:  # noqa
+class KernelState(IntEnum):  # noqa
     INITIAL = 0
     CALLS_RESOLVED = 1
     PREPROCESSED = 2
@@ -105,11 +74,18 @@ def _get_inames_from_domains(domains):
             (frozenset(dom.get_var_names(dim_type.set)) for dom in domains))
 
 
-class _not_provided:  # noqa: N801
-    pass
+@dataclass(frozen=True)
+class _BoundsRecord:
+    lower_bound_pw_aff: isl.PwAff
+    upper_bound_pw_aff: isl.PwAff
+    size: isl.PwAff
 
 
-class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
+PreambleGenerator = Callable[["PreambleInfo"], Iterator[Tuple[str, str]]]
+
+
+@dataclass(frozen=True)
+class LoopKernel(Taggable):
     """These correspond more or less directly to arguments of
     :func:`loopy.make_kernel`.
 
@@ -144,7 +120,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
         A :class:`islpy.BasicSet` parameter domain.
 
-    .. attribute:: local_sizes
     .. attribute:: temporary_variables
 
         A :class:`dict` of mapping variable names to
@@ -179,7 +154,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         were applied to the kernel. These are stored so that they may be repeated
         on expressions the user specifies later.
 
-    .. attribute:: cache_manager
     .. attribute:: options
 
         An instance of :class:`loopy.Options`
@@ -204,162 +178,55 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
     .. automethod:: tagged
     .. automethod:: without_tags
     """
-
-    # {{{ constructor
-
-    def __init__(self, domains, instructions, args=None,
-            schedule=None,
-            linearization=None,
-            name="loopy_kernel",
-            preambles=None,
-            preamble_generators=None,
-            assumptions=None,
-            local_sizes=None,
-            temporary_variables=None,
-            inames=None,
-            iname_to_tags=None,
-            substitutions=None,
-            symbol_manglers=None,
-
-            iname_slab_increments=None,
-            loop_priority=frozenset(),
-            silenced_warnings=None,
-
-            applied_iname_rewrites=None,
-            cache_manager=None,
-            index_dtype=None,
-            options=None,
-
-            state=KernelState.INITIAL,
-            target=None,
-
-            overridden_get_grid_sizes_for_insn_ids=None,
-            _cached_written_variables=None,
-            tags=frozenset()):
-        """
-        :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get
-            intersected in slab decomposition, their grid sizes shouldn't
-            change. This provides a way to forward sub-kernel grid size requests.
-        """
-
-        # {{{ process constructor arguments
-
-        if args is None:
-            args = []
-        if preambles is None:
-            preambles = []
-        if preamble_generators is None:
-            preamble_generators = []
-        if local_sizes is None:
-            local_sizes = {}
-        if temporary_variables is None:
-            temporary_variables = {}
-        if substitutions is None:
-            substitutions = {}
-        if symbol_manglers is None:
-            symbol_manglers = []
-        if iname_slab_increments is None:
-            iname_slab_increments = {}
-
-        if silenced_warnings is None:
-            silenced_warnings = []
-        if applied_iname_rewrites is None:
-            applied_iname_rewrites = []
-
-        if cache_manager is None:
-            from loopy.kernel.tools import SetOperationCacheManager
-            cache_manager = SetOperationCacheManager()
-
-        if iname_to_tags is not None:
-            warn("Providing iname_to_tags is deprecated, pass inames instead. "
-                    "Will be unsupported in 2022.",
-                    DeprecationWarning, stacklevel=2)
-
-            if inames is not None:
-                raise LoopyError("Cannot provide both iname_to_tags and inames to "
-                        "LoopKernel.__init__")
-
-            inames = {
-                name: inames.get(name, Iname(name, frozenset()))
-                for name in _get_inames_from_domains(domains)}
-
-        assert isinstance(inames, dict)
-
-        if index_dtype is None:
-            index_dtype = np.int32
-
-        # }}}
-
-        assert isinstance(assumptions, isl.BasicSet)
-        assert assumptions.is_params()
-
-        from loopy.types import to_loopy_type
-        index_dtype = to_loopy_type(index_dtype)
-        if not index_dtype.is_integral():
+    domains: Sequence[isl.BasicSet]
+    instructions: Sequence[InstructionBase]
+    args: Sequence[KernelArgument]
+    assumptions: isl.BasicSet
+    temporary_variables: Mapping[str, TemporaryVariable]
+    inames: Mapping[str, Iname]
+    substitutions: Mapping[str, SubstitutionRule]
+    options: Options
+    target: TargetBase
+    tags: FrozenSet[Tag]
+    state: KernelState = KernelState.INITIAL
+    name: str = "loopy_kernel"
+
+    preambles: Sequence[Tuple[str, str]] = ()
+    preamble_generators: Sequence[PreambleGenerator] = ()
+    symbol_manglers: Sequence[
+            Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = ()
+    linearization: Optional[Sequence[ScheduleItem]] = None
+    iname_slab_increments: Mapping[str, Tuple[int, int]] = field(
+            default_factory=Map)
+    loop_priority: FrozenSet[Tuple[str, ...]] = field(
+            default_factory=frozenset)
+    applied_iname_rewrites: Sequence[Dict[str, ExpressionT]] = ()
+    index_dtype: NumpyType = NumpyType(np.dtype(np.int32))
+    silenced_warnings: FrozenSet[str] = frozenset()
+
+    # FIXME Yuck, this should go.
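+    # When kernels get intersected in slab decomposition, their grid sizes
+    # shouldn't change. This provides a way to forward sub-kernel grid size
+    # requests.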
+    overridden_get_grid_sizes_for_insn_ids: Optional[
+            Callable[
+                [FrozenSet[str],
+                    Dict[str, "InKernelCallable"],
+                    bool],
+                Tuple[Tuple[int, ...], Tuple[int, ...]]]] = None
+
+    def __post_init__(self):
+        assert isinstance(self.assumptions, isl.BasicSet)
+        assert self.assumptions.is_params()
+
+        if not self.index_dtype.is_integral():
             raise TypeError("index_dtype must be an integer")
-        if np.iinfo(index_dtype.numpy_dtype).min >= 0:
+        if np.iinfo(self.index_dtype.numpy_dtype).min >= 0:
             raise TypeError("index_dtype must be signed")
 
-        if state not in [
-                KernelState.INITIAL,
-                KernelState.CALLS_RESOLVED,
-                KernelState.PREPROCESSED,
-                KernelState.LINEARIZED,
-                ]:
-            raise ValueError("invalid value for 'state'")
-
-        if linearization is not None:
-            if schedule is not None:
-                # these should not both be present
-                raise ValueError(
-                    "Received both 'schedule' and 'linearization' args. "
-                    "'schedule' is deprecated and will be removed "
-                    "in 2022. Pass 'linearization' only instead.")
-        elif schedule is not None:
-            warn(
-                "'schedule' is deprecated and will be removed in 2022. "
-                "Use 'linearization' instead.",
-                DeprecationWarning, stacklevel=2)
-            linearization = schedule
-
-        assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT
-
-        super().__init__(
-                domains=domains,
-                instructions=instructions,
-                args=args,
-                linearization=linearization,
-                name=name,
-                preambles=preambles,
-                preamble_generators=preamble_generators,
-                assumptions=assumptions,
-                iname_slab_increments=iname_slab_increments,
-                loop_priority=loop_priority,
-                silenced_warnings=silenced_warnings,
-                temporary_variables=temporary_variables,
-                local_sizes=local_sizes,
-                inames=inames,
-                substitutions=substitutions,
-                cache_manager=cache_manager,
-                applied_iname_rewrites=applied_iname_rewrites,
-                symbol_manglers=symbol_manglers,
-                index_dtype=index_dtype,
-                options=options,
-                state=state,
-                target=target,
-                overridden_get_grid_sizes_for_insn_ids=(
-                    overridden_get_grid_sizes_for_insn_ids),
-                _cached_written_variables=_cached_written_variables,
-                tags=tags)
-
-        self._kernel_executor_cache = {}
-
-    # }}}
+        assert self.assumptions.get_ctx() == isl.DEFAULT_CONTEXT
 
     # {{{ symbol mangling
 
     def mangle_symbol(self, ast_builder, identifier):
-        manglers = ast_builder.symbol_manglers() + self.symbol_manglers
+        manglers = ast_builder.symbol_manglers() + list(self.symbol_manglers)
 
         for mangler in manglers:
             result = mangler(self, identifier)
@@ -378,18 +245,15 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
                 | set(self.temporary_variables.keys()))
 
     @memoize_method
-    def all_variable_names(self, include_temp_storage=True):
+    def all_variable_names(self):
         return (
                 set(self.temporary_variables.keys())
-                | {tv.base_storage
-                    for tv in self.temporary_variables.values()
-                    if tv.base_storage is not None and include_temp_storage}
                 | set(self.substitutions.keys())
                 | {arg.name for arg in self.args}
                 | set(self.all_inames()))
 
     def get_var_name_generator(self):
-        return _UniqueVarNameGenerator(self.all_variable_names())
+        return UniqueNameGenerator(self.all_variable_names())
 
     def get_instruction_id_generator(self, based_on="insn"):
         used_ids = {insn.id for insn in self.instructions}
@@ -416,9 +280,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         return frozenset(result)
 
     def get_group_name_generator(self):
-        return _UniqueVarNameGenerator(set(self.all_group_names()))
+        return UniqueNameGenerator(set(self.all_group_names()))
 
-    def get_var_descriptor(self, name):
+    def get_var_descriptor(
+            self, name: str) -> Union[TemporaryVariable, KernelArgument]:
         try:
             return self.arg_dict[name]
         except KeyError:
@@ -439,15 +304,13 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         try:
             dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(),
                     name)
-            from loopy import ValueArg
             return ValueArg(name, dtype)
         except TypeError:
             pass
 
         raise ValueError("nothing known about variable '%s'" % name)
 
-    @property
-    @memoize_method
+    @cached_property
     def id_to_insn(self):
         return {insn.id: insn for insn in self.instructions}
 
@@ -456,7 +319,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
     # {{{ domain wrangling
 
     @memoize_method
-    def parents_per_domain(self):
+    def parents_per_domain(self) -> Sequence[Optional[int]]:
         """Return a list corresponding to self.domains (by index)
         containing domain indices which are nested around this
         domain.
@@ -470,8 +333,8 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         # determines the granularity of inames to be popped/decactivated
         # if we ascend a level.
 
-        iname_set_stack = []
-        result = []
+        iname_set_stack: List[Set[str]] = []
+        result: List[Optional[int]] = []
 
         from loopy.kernel.tools import is_domain_dependent_on_inames
 
@@ -543,24 +406,24 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         return result
 
     @memoize_method
-    def _get_home_domain_map(self):
+    def _get_home_domain_map(self) -> Mapping[str, int]:
         return {
                 iname: i_domain
                 for i_domain, dom in enumerate(self.domains)
                 for iname in dom.get_var_names(dim_type.set)}
 
-    def get_home_domain_index(self, iname):
+    def get_home_domain_index(self, iname: str) -> int:
         return self._get_home_domain_map()[iname]
 
     @property
-    def isl_context(self):
+    def isl_context(self) -> isl.Context:
         for dom in self.domains:
             return dom.get_ctx()
 
         raise AssertionError()
 
     @memoize_method
-    def combine_domains(self, domains):
+    def combine_domains(self, domains: Sequence[int]) -> isl.BasicSet:
         """
         :arg domains: domain indices of domains to be combined. More 'dominant'
             domains (those which get most say on the actual dim_type of an iname)
@@ -584,7 +447,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
         return result
 
-    def get_inames_domain(self, inames):
+    def get_inames_domain(self, inames: FrozenSet[str]) -> isl.BasicSet:
         if not inames:
             return self.combine_domains(())
 
@@ -665,18 +528,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
     # {{{ iname wrangling
 
-    @property
-    @memoize_method
-    def iname_to_tags(self):
-        warn(
-                "LoopKernel.iname_to_tags is deprecated. "
-                "Call LoopKernel.inames instead, "
-                "will be unsupported in 2022.",
-                DeprecationWarning, stacklevel=2)
-        return {name: iname.tags
-                for name, iname in self.inames.items()
-                if iname.tags}
-
     def iname_tags(self, iname):
         return self.inames[iname].tags
 
@@ -865,15 +716,20 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
         return result
 
-    @memoize_method
     def get_written_variables(self):
-        if self._cached_written_variables is not None:
+        try:
             return self._cached_written_variables
+        except AttributeError:
+            pass
 
-        return frozenset(
+        result = frozenset({
                 var_name
                 for insn in self.instructions
-                for var_name in insn.assignee_var_names())
+                for var_name in insn.assignee_var_names()})
+
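+        # The dataclass is frozen, so the cache attribute must be set via
+        # object.__setattr__. LoopKernel.copy() carries it over as long as
+        # the instructions are unchanged.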
+        object.__setattr__(self, "_cached_written_variables", result)
+
+        return result
 
     @memoize_method
     def get_temporary_to_base_storage_map(self):
@@ -888,7 +744,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
     def get_unwritten_value_args(self):
         written_vars = self.get_written_variables()
 
-        from loopy.kernel.data import ValueArg
         return {
                 arg.name
                 for arg in self.args
@@ -898,16 +753,12 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
     # {{{ argument wrangling
 
-    @property
-    @memoize_method
-    def arg_dict(self):
+    @cached_property
+    def arg_dict(self) -> Dict[str, KernelArgument]:
         return {arg.name: arg for arg in self.args}
 
-    @property
-    @memoize_method
+    @cached_property
     def scalar_loop_args(self):
-        from loopy.kernel.data import ValueArg
-
         if self.args is None:
             return []
         else:
@@ -936,6 +787,18 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
     # {{{ bounds finding
 
+    @property
+    def cache_manager(self):
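+        # Created lazily on first access and stored via object.__setattr__,
+        # since the dataclass is frozen.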
+        try:
+            return self._cache_manager
+        except AttributeError:
+            pass
+
+        from loopy.kernel.tools import SetOperationCacheManager
+        cm = SetOperationCacheManager()
+        object.__setattr__(self, "_cache_manager", cm)
+        return cm
+
     @memoize_method
     def get_iname_bounds(self, iname, constants_only=False):
         domain = self.get_inames_domain(frozenset([iname]))
@@ -963,13 +826,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
                     dom_intersect_assumptions, iname_idx)
                 .coalesce())
 
-        class BoundsRecord(ImmutableRecord):
-            pass
-
         size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
         size = size.gist(assumptions)
 
-        return BoundsRecord(
+        return _BoundsRecord(
                 lower_bound_pw_aff=lower_bound_pw_aff,
                 upper_bound_pw_aff=upper_bound_pw_aff,
                 size=size)
@@ -1074,19 +934,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
             tgt_dict[tag.axis] = size
 
-        # {{{ override local_sizes with self.local_sizes
-
-        for i_lsize, lsize in self.local_sizes.items():
-            if i_lsize <= max(local_sizes.keys()):
-                local_sizes[i_lsize] = lsize
-            else:
-                from warnings import warn
-                warn(f"Forced local sizes '{i_lsize}: {lsize}' is unused"
-                     f" because kernel '{self.name}' uses {max(local_sizes.keys())}"
-                     " local hardware axes.")
-
-        # }}}
-
         return global_sizes, local_sizes
 
     @memoize_method
@@ -1181,8 +1028,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
                 callables_table, ignore_auto=ignore_auto,
                 return_dict=return_dict)
 
-    def get_grid_size_upper_bounds_as_exprs(self, callables_table,
-            ignore_auto=False, return_dict=False):
+    def get_grid_size_upper_bounds_as_exprs(
+            self, callables_table,
+            ignore_auto=False, return_dict=False
+            ) -> Tuple[Tuple[ExpressionT, ...], Tuple[ExpressionT, ...]]:
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1298,8 +1147,9 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
             lines.extend(sep)
             if show_labels:
                 lines.append("ARGUMENTS:")
-            for arg_name in natsorted(kernel.arg_dict):
-                lines.append(str(kernel.arg_dict[arg_name]))
+            # Arguments are ordered; do not be tempted to sort them.
+            for arg in kernel.args:
+                lines.append(str(arg))
 
         if "domains" in what:
             lines.extend(sep)
@@ -1378,36 +1228,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
     # }}}
 
-    # {{{ implementation arguments
-
-    @property
-    @memoize_method
-    def impl_arg_to_arg(self):
-        from loopy.kernel.array import ArrayBase
-
-        result = {}
-
-        for arg in self.args:
-            if not isinstance(arg, ArrayBase):
-                result[arg.name] = arg
-                continue
-
-            if arg.shape is None or arg.dim_tags is None:
-                result[arg.name] = arg
-                continue
-
-            subscripts_and_names = arg.subscripts_and_names()
-            if subscripts_and_names is None:
-                result[arg.name] = arg
-                continue
-
-            for _index, sub_arg_name in subscripts_and_names:
-                result[sub_arg_name] = arg
-
-        return result
-
-    # }}}
-
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
@@ -1426,11 +1246,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
     def __getstate__(self):
         result = {
-                key: getattr(self, key)
-                for key in self.__class__.fields
-                if hasattr(self, key)}
-
-        result.pop("cache_manager", None)
+                fld.name: getattr(self, fld.name)
+                for fld in fields(self.__class__)
+                if hasattr(self, fld.name)
+                and not fld.name.startswith("_")}
 
         # Make the instructions lazily unpickling, to support faster
         # cache retrieval for execution.
@@ -1452,71 +1271,75 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         from loopy.tools import LoopyKeyBuilder
         LoopyKeyBuilder()(self)
 
+        # pylint: disable=no-member
         return (result, self._pytools_persistent_hash_digest)
 
     def __setstate__(self, state):
         attribs, p_hash_digest = state
 
-        new_fields = set()
-
-        for k, v in attribs.items():
-            setattr(self, k, v)
-            new_fields.add(k)
-
-        self.register_fields(new_fields)
+        for name, val in attribs.items():
+            object.__setattr__(self, name, val)
 
         if 0:
             # {{{ check that 'reconstituted' object has same hash
 
             from loopy.tools import LoopyKeyBuilder
-            assert p_hash_digest == LoopyKeyBuilder()(self)
+            hash_before = LoopyKeyBuilder()(self)
 
-            # }}}
+            object.__setattr__(
+                    self, "_pytools_persistent_hash_digest", p_hash_digest)
 
-        self._pytools_persistent_hash_digest = p_hash_digest
+            assert hash_before == LoopyKeyBuilder()(self)
+
+            # }}}
+        else:
+            object.__setattr__(
+                    self, "_pytools_persistent_hash_digest", p_hash_digest)
 
         from loopy.kernel.tools import SetOperationCacheManager
-        self.cache_manager = SetOperationCacheManager()
-        self._kernel_executor_cache = {}
+        object.__setattr__(self, "_cache_manager", SetOperationCacheManager())
 
     # }}}
 
     # {{{ persistent hash key generation / comparison
 
-    hash_fields = (
+    hash_fields = [
             "domains",
             "instructions",
             "args",
-            "linearization",
-            "name",
-            "preambles",
             "assumptions",
-            "local_sizes",
             "temporary_variables",
             "inames",
             "substitutions",
-            "iname_slab_increments",
-            "loop_priority",
-            "silenced_warnings",
             "options",
-            "state",
             "target",
-            )
+            "tags",
+            "state",
+            "name",
 
-    comparison_fields = hash_fields + (
-            # Contains pymbolic expressions, hence a (small) headache to hash.
-            # Likely not needed for hash uniqueness => headache avoided.
-            "applied_iname_rewrites",
+            "preambles",
+            # preamble_generators
+            # symbol_manglers
+            "linearization",
+            "iname_slab_increments",
+            "loop_priority",
+            # applied_iname_rewrites
+            "index_dtype",
+            "silenced_warnings",
 
-            # These are lists of functions. It's not clear how to
-            # hash these correctly, so let's not attempt it. We'll
-            # just assume that the rest of the hash is specific enough
-            # that we won't have to rely on differences in these to
-            # resolve hash conflicts.
+            # missing:
+            # - applied_iname_rewrites
+            #   Contains pymbolic expressions, hence a (small) headache to hash.
+            #   Likely not needed for hash uniqueness => headache avoided.
 
-            "preamble_generators",
-            "symbol_manglers",
-            )
+            # - preamble_generators
+            # - symbol_manglers
+            #   These are lists of functions. It's not clear how to
+            #   hash these correctly, so let's not attempt it. We'll
+            #   just assume that the rest of the hash is specific enough
+            #   that we won't have to rely on differences in these to
+            #   resolve hash conflicts.
+            ]
 
     update_persistent_hash = update_persistent_hash
 
@@ -1528,56 +1351,9 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
         self.update_persistent_hash(key_hash, LoopyKeyBuilder())
         return hash(key_hash.digest())
 
-    def __eq__(self, other):
-        if self is other:
-            return True
-
-        if not isinstance(other, LoopKernel):
-            return False
-
-        for field_name in self.comparison_fields:
-            if field_name == "domains":
-                if len(self.domains) != len(other.domains):
-                    return False
-
-                for set_a, set_b in zip(self.domains, other.domains):
-                    if not (set_a.plain_is_equal(set_b) or set_a.is_equal(set_b)):
-                        return False
-
-            elif field_name == "assumptions":
-                if not (
-                        self.assumptions.plain_is_equal(other.assumptions)
-                        or self.assumptions.is_equal(other.assumptions)):
-                    return False
-
-            elif getattr(self, field_name) != getattr(other, field_name):
-                return False
-
-        return True
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
     # }}}
 
-    def get_copy_kwargs(self, **kwargs):
-        if "iname_to_tags" in kwargs:
-            if "inames" in kwargs:
-                raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to "
-                        "LoopKernel.get_copy_kwargs")
-
-            warn("Providing iname_to_tags is deprecated, pass inames instead. "
-                    "Will be unsupported in 2022.",
-                    DeprecationWarning, stacklevel=2)
-
-            iname_to_tags = kwargs["iname_to_tags"]
-            domains = kwargs.get("domains", self.domains)
-            kwargs["inames"] = {name: Iname(name,
-                                            iname_to_tags.get(name, frozenset()))
-                                for name in _get_inames_from_domains(domains)
-                                }
-            del kwargs["iname_to_tags"]
-
+    def get_copy_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
         if "domains" in kwargs:
             inames = kwargs.get("inames", self.inames)
             domains = kwargs["domains"]
@@ -1586,35 +1362,37 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
 
             assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains)
 
-        if "instructions" in kwargs:
-            # Avoid carrying over an invalid cache when instructions are
-            # modified.
-            kwargs["_cached_written_variables"] = None
+        return kwargs
 
-        return super().get_copy_kwargs(**kwargs)
+    def copy(self, **kwargs: Any) -> "LoopKernel":
+        result = replace(self, **self.get_copy_kwargs(**kwargs))
 
-    def copy(self, **kwargs):
-        if "iname_to_tags" in kwargs:
-            if "inames" in kwargs:
-                raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to "
-                        "LoopKernel.copy")
+        object.__setattr__(result, "_cache_manager", self.cache_manager)
 
-        if "schedule" in kwargs:
-            if "linearization" in kwargs:
-                raise LoopyError("Cannot pass both `schedule` and "
-                                 "`linearization` to LoopKernel.copy")
-
-            kwargs["linearization"] = None
+        if "instructions" not in kwargs:
+            # Carry the cached written variables over only if the instructions
+            # are unchanged; modifying them would invalidate the cache.
+            try:
+                # The type system does not know about this attribute, and we're
+                # not about to tell it. It's an internal caching hack.
+                cwv = self._cached_written_variables  # type: ignore[attr-defined]
+            except AttributeError:
+                pass
+            else:
+                object.__setattr__(result, "_cached_written_variables", cwv)
 
-        from pytools.tag import normalize_tags, check_tag_uniqueness
-        tags = kwargs.pop("tags", _not_provided)
-        if tags is not _not_provided:
-            kwargs["tags"] = check_tag_uniqueness(normalize_tags(tags))
+        return result
 
-        return super().copy(**kwargs)
+    def _with_new_tags(self, tags) -> "LoopKernel":
+        return replace(self, tags=tags)
 
-    def _with_new_tags(self, tags):
-        return self.copy(tags=tags)
+    @memoize_method
+    def _separation_info(self) -> Dict[str, _ArraySeparationInfo]:
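+        # Map the names of arguments that were split into separate sub-arrays
+        # to the metadata describing that split.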
+        return {
+                arg.name: arg._separation_info
+                for arg in self.args
+                if isinstance(arg, ArrayArg) and arg._separation_info is not None
+                }
 
 # }}}
 
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index d41a18152f0b3c35da1704e0071a6f8f9af7609c..3c444931aa47870763682ea59340070437fd0e78 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1,5 +1,6 @@
 """Implementation tagging of array axes."""
 
+from __future__ import annotations
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -23,16 +24,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import sys
+from typing import (cast, Optional, Tuple, Union, FrozenSet, Type, Sequence,
+        List, Callable, ClassVar, TypeVar, TYPE_CHECKING)
+from dataclasses import dataclass
 import re
 from warnings import warn
 
-from pytools import ImmutableRecord, memoize_method
-from pytools.tag import Taggable
+from pytools import ImmutableRecord
+from pytools.tag import Taggable, Tag
 
 import numpy as np  # noqa
 
 from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
+from loopy.typing import ExpressionT, ShapeType
+from loopy.types import LoopyType
+
+if TYPE_CHECKING:
+    from loopy.target import TargetBase
+    from loopy.kernel import LoopKernel
+    from loopy.kernel.data import auto, TemporaryVariable, ArrayArg
+    from loopy.codegen import VectorizationInfo
+
+if getattr(sys, "_BUILDING_SPHINX_DOCS", False):
+    from loopy.target import TargetBase  # noqa: F811
+
+
+T = TypeVar("T")
 
 
 __doc__ = """
@@ -680,11 +699,19 @@ class ArrayBase(ImmutableRecord, Taggable):
 
     (supports persistent hashing)
     """
+    name: str
+    dtype: LoopyType
+    shape: Union[ShapeType, Type["auto"], None]
+    dim_tags: Optional[Sequence[ArrayDimImplementationTag]]
+    offset: Union[ExpressionT, str, None]
+    dim_names: Optional[Tuple[str, ...]]
+    alignment: Optional[int]
+    tags: FrozenSet[Tag]
 
     # Note that order may also wind up in attributes, if the
     # number of dimensions has not yet been determined.
 
-    allowed_extra_kwargs = []
+    allowed_extra_kwargs: ClassVar[Tuple[str, ...]] = ()
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
             dim_names=None, strides=None, order=None, for_atomic=False,
@@ -949,7 +976,7 @@ class ArrayBase(ImmutableRecord, Taggable):
                         for i in self.dim_tags))
 
         if self.offset:
-            info_entries.append("offset: %s" % self.offset)
+            info_entries.append(f"offset: {self.offset}")
 
         if self.tags:
             info_entries.append(
@@ -1042,18 +1069,27 @@ class ArrayBase(ImmutableRecord, Taggable):
         else:
             return self
 
-    def vector_size(self, target):
+    def vector_size(self, target: TargetBase) -> int:
         """Return the size of the vector type used for the array
         divided by the basic data type.
 
         Note: For 3-vectors, this will be 4.
         """
 
-        if self.dim_tags is None:
+        if self.dim_tags is None or self.shape is None:
             return 1
 
+        assert isinstance(self.shape, tuple)
+
+        saw_vec_tag = False
+
         for i, dim_tag in enumerate(self.dim_tags):
             if isinstance(dim_tag, VectorArrayDimTag):
+                if saw_vec_tag:
+                    raise LoopyError(f"more than one axis of '{self.name}' "
+                            "is tagged 'vec'")
+                saw_vec_tag = True
+
                 shape_i = self.shape[i]
                 if not is_integer(shape_i):
                     raise LoopyError("shape of '%s' has non-constant-integer "
@@ -1066,222 +1102,51 @@ class ArrayBase(ImmutableRecord, Taggable):
 
         return 1
 
-    def decl_info(self, target, is_written, index_dtype, shape_override=None):
-        """Return a list of :class:`loopy.codegen.ImplementedDataInfo`
-        instances corresponding to the array.
-        """
 
-        array_shape = self.shape
-        if shape_override is not None:
-            array_shape = shape_override
-
-        from loopy.codegen import ImplementedDataInfo
-        from loopy.kernel.data import ValueArg
-
-        def gen_decls(name_suffix,
-                shape, strides,
-                unvec_shape, unvec_strides,
-                stride_arg_axes,
-                dtype, user_index):
-            """
-            :arg unvec_shape: shape tuple
-                that accounts for :class:`loopy.kernel.array.VectorArrayDimTag`
-                in a scalar manner
-            :arg unvec_strides: strides tuple
-                that accounts for :class:`loopy.kernel.array.VectorArrayDimTag`
-                in a scalar manner
-            :arg stride_arg_axes: a tuple *(user_axis, impl_axis, unvec_impl_axis)*
-            :arg user_index: A tuple representing a (user-facing)
-                multi-dimensional subscript. This is filled in with
-                concrete integers when known (such as for separate-array
-                dim tags), and with *None* where the index won't be
-                known until run time.
-            """
-
-            if dtype is None:
-                dtype = self.dtype
-
-            user_axis = len(user_index)
-
-            num_user_axes = self.num_user_axes(require_answer=False)
-
-            if num_user_axes is None or user_axis >= num_user_axes:
-                # {{{ recursion base case
-
-                full_name = self.name + name_suffix
-
-                stride_args = []
-                strides = list(strides)
-                unvec_strides = list(unvec_strides)
-
-                # generate stride arguments, yielded later to keep array first
-                for stride_user_axis, stride_impl_axis, stride_unvec_impl_axis \
-                        in stride_arg_axes:
-                    stride_name = full_name+"_stride%d" % stride_user_axis
-
-                    from pymbolic import var
-                    strides[stride_impl_axis] = \
-                            unvec_strides[stride_unvec_impl_axis] = \
-                            var(stride_name)
-
-                    stride_args.append(
-                            ImplementedDataInfo(
-                                target=target,
-                                name=stride_name,
-                                dtype=index_dtype,
-                                arg_class=ValueArg,
-                                stride_for_name_and_axis=(
-                                    full_name, stride_impl_axis),
-                                is_written=False))
-
-                yield ImplementedDataInfo(
-                            target=target,
-                            name=full_name,
-                            base_name=self.name,
-
-                            arg_class=type(self),
-                            dtype=dtype,
-                            shape=shape,
-                            strides=tuple(strides),
-                            unvec_shape=unvec_shape,
-                            unvec_strides=tuple(unvec_strides),
-                            allows_offset=bool(self.offset),
-
-                            is_written=is_written)
-
-                import loopy as lp
-
-                if self.offset is lp.auto:
-                    offset_name = full_name+"_offset"
-                    yield ImplementedDataInfo(
-                                target=target,
-                                name=offset_name,
-                                dtype=index_dtype,
-                                arg_class=ValueArg,
-                                offset_for_name=full_name,
-                                is_written=False)
-
-                yield from stride_args
-
-                # }}}
-
-                return
-
-            dim_tag = self.dim_tags[user_axis]
-
-            if isinstance(dim_tag, FixedStrideArrayDimTag):
-                if array_shape is None:
-                    new_shape_axis = None
-                else:
-                    new_shape_axis = array_shape[user_axis]
-
-                import loopy as lp
-                if dim_tag.stride is lp.auto:
-                    new_stride_arg_axes = stride_arg_axes \
-                            + ((user_axis, len(strides), len(unvec_strides)),)
-
-                    # repaired above when final array name is known
-                    # (and stride argument is created)
-                    new_stride_axis = None
-                else:
-                    new_stride_arg_axes = stride_arg_axes
-                    new_stride_axis = dim_tag.stride
-
-                yield from gen_decls(name_suffix,
-                        shape + (new_shape_axis,), strides + (new_stride_axis,),
-                        unvec_shape + (new_shape_axis,),
-                        unvec_strides + (new_stride_axis,),
-                        new_stride_arg_axes,
-                        dtype, user_index + (None,))
-
-            elif isinstance(dim_tag, SeparateArrayArrayDimTag):
-                shape_i = array_shape[user_axis]
-                if not is_integer(shape_i):
-                    raise LoopyError("shape of '%s' has non-constant "
-                            "integer axis %d (0-based)" % (
-                                self.name, user_axis))
-
-                for i in range(shape_i):
-                    yield from gen_decls(name_suffix + "_s%d" % i,
-                            shape, strides, unvec_shape, unvec_strides,
-                            stride_arg_axes, dtype,
-                            user_index + (i,))
-
-            elif isinstance(dim_tag, VectorArrayDimTag):
-                shape_i = array_shape[user_axis]
-                if not is_integer(shape_i):
-                    raise LoopyError("shape of '%s' has non-constant "
-                            "integer axis %d (0-based)" % (
-                                self.name, user_axis))
-
-                yield from gen_decls(name_suffix,
-                        shape, strides,
-                        unvec_shape + (shape_i,),
-                        # vectors always have stride 1
-                        unvec_strides + (1,),
-                        stride_arg_axes,
-                        target.vector_dtype(dtype, shape_i),
-                        user_index + (None,))
-
-            else:
-                raise LoopyError("unsupported array dim implementation tag '%s' "
-                        "in array '%s'" % (dim_tag, self.name))
-
-        yield from gen_decls(name_suffix="",
-                shape=(), strides=(),
-                unvec_shape=(), unvec_strides=(),
-                stride_arg_axes=(),
-                dtype=self.dtype, user_index=())
-
-    @memoize_method
-    def sep_shape(self):
-        sep_shape = []
-        for shape_i, dim_tag in zip(self.shape, self.dim_tags):
-            if isinstance(dim_tag, SeparateArrayArrayDimTag):
-                if not is_integer(shape_i):
-                    raise TypeError("array '%s' has non-fixed-size "
-                            "separate-array axis" % self.name)
+# }}}
 
-                sep_shape.append(shape_i)
+
+def drop_vec_dims(
+        dim_tags: Tuple[ArrayDimImplementationTag, ...],
+        t: Tuple[T, ...]) -> Tuple[T, ...]:
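+    # Return *t* with the entries corresponding to vector (VectorArrayDimTag)
+    # axes of *dim_tags* removed.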
+    assert len(dim_tags) == len(t)
+    return tuple(t_i for dim_tag, t_i in zip(dim_tags, t)
+            if not isinstance(dim_tag, VectorArrayDimTag))
 
-        return tuple(sep_shape)
 
-    @memoize_method
-    def subscripts_and_names(self):
-        sep_shape = self.sep_shape()
+def get_strides(array: ArrayBase) -> Tuple[ExpressionT, ...]:
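+    # Return one stride expression per axis of *array*, derived from its
+    # dim_tags. Vector axes always have stride 1.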
+    from pymbolic import var
+    result: List[ExpressionT] = []
 
-        if not sep_shape:
-            return None
+    if array.dim_tags is None:
+        return ()
 
-        def unwrap_1d_indices(idx):
-            # This allows these indices to work on Python sequences, too, not
-            # just numpy arrays.
+    for dim_tag in array.dim_tags:
+        if isinstance(dim_tag, VectorArrayDimTag):
+            result.append(1)
 
-            if len(idx) == 1:
-                return idx[0]
+        elif isinstance(dim_tag, FixedStrideArrayDimTag):
+            if isinstance(dim_tag.stride, str):
+                result.append(var(dim_tag.stride))
             else:
-                return idx
+                result.append(dim_tag.stride)
 
-        return [
-                (unwrap_1d_indices(i),
-                    self.name + "".join("_s%d" % sub_i for sub_i in i))
-                for i in np.ndindex(sep_shape)]
+        else:
+            raise ValueError("unexpected dim tag type during stride finding: "
+                    f"'{type(dim_tag)}'")
 
-# }}}
+    return tuple(result)
 
 
 # {{{ access code generation
 
+@dataclass(frozen=True)
 class AccessInfo(ImmutableRecord):
-    """
-    .. attribute:: array_name
-    .. attribute:: vector_index
-    .. attribute:: subscripts
-        List of expressions, one for each target axis
-    """
+    array_name: str
+    vector_index: Optional[int]
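+    # one subscript expression per target axis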
+    subscripts: Tuple[ExpressionT, ...]
 
 
-def _apply_offset(sub, array_name, ary):
+def _apply_offset(sub: ExpressionT, ary: ArrayBase) -> ExpressionT:
     """
     Helper for :func:`get_access_info`.
     Augments *ary*'s subscript index expression (*sub*) with its offset info.
@@ -1293,18 +1158,33 @@ def _apply_offset(sub, array_name, ary):
     from pymbolic import var
 
     if ary.offset:
+        from loopy.kernel.data import TemporaryVariable
+        if isinstance(ary, TemporaryVariable):
+            # offsets for base_storage are added when the temporary
+            # is declared.
+            return sub
+
         if ary.offset is lp.auto:
-            return var(array_name+"_offset") + sub
+            raise AssertionError(
+                    f"Offset for '{ary.name}' should have been replaced "
+                    "with an actual argument by "
+                    "make_temporaries_for_offsets_and_strides "
+                    "during preprocessing.")
         elif isinstance(ary.offset, str):
             return var(ary.offset) + sub
         else:
             # assume it's an expression
-            return ary.offset + sub
+            # FIXME: mypy can't figure out that ExpressionT + ExpressionT works
+            return ary.offset + sub  # type: ignore[call-overload, arg-type, operator]  # noqa: E501
     else:
         return sub
 
 
-def get_access_info(target, ary, index, eval_expr, vectorization_info):
+def get_access_info(kernel: "LoopKernel",
+        ary: Union["ArrayArg", "TemporaryVariable"],
+        index: Union[ExpressionT, Tuple[ExpressionT, ...]],
+        eval_expr: Callable[[ExpressionT], int],
+        vectorization_info: Optional["VectorizationInfo"]) -> AccessInfo:
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
@@ -1313,7 +1193,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
     """
 
     import loopy as lp
-    from pymbolic import var
 
     def eval_expr_assert_integer_constant(i, expr):
         from pymbolic.mapper.evaluator import UnknownVariableError
@@ -1336,8 +1215,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
     if not isinstance(index, tuple):
         index = (index,)
 
-    array_name = ary.name
-
     if ary.dim_tags is None:
         if len(index) != 1:
             raise LoopyError("Array '%s' has no known axis implementation "
@@ -1347,8 +1224,8 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                     % ary.name)
 
         return AccessInfo(
-                array_name=array_name,
-                subscripts=(_apply_offset(index[0], array_name, ary),),
+                array_name=ary.name,
+                subscripts=(_apply_offset(index[0], ary),),
                 vector_index=None)
 
     if len(ary.dim_tags) != len(index):
@@ -1359,21 +1236,33 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
     num_target_axes = ary.num_target_axes()
 
     vector_index = None
-    subscripts = [0] * num_target_axes
+    subscripts: List[ExpressionT] = [0] * num_target_axes
 
-    vector_size = ary.vector_size(target)
+    vector_size = ary.vector_size(kernel.target)
 
     # {{{ process separate-array dim tags first, to find array name
 
-    for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
-        if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_integer_constant(i, idx)
-            array_name += "_s%d" % idx
+    from loopy.kernel.data import ArrayArg
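+    # If the argument has been split into separate sub-arrays, evaluate the
+    # indices along the separated axes and continue with the sub-array they
+    # select.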
+    if isinstance(ary, ArrayArg) and ary._separation_info:
+        sep_index = []
+        remaining_index = []
+        for iaxis, (index_i, dim_tag) in enumerate(zip(index, ary.dim_tags)):
+            if iaxis in ary._separation_info.sep_axis_indices_set:
+                sep_index.append(eval_expr_assert_integer_constant(iaxis, index_i))
+                assert isinstance(dim_tag, SeparateArrayArrayDimTag)
+            else:
+                remaining_index.append(index_i)
+
+        index = tuple(remaining_index)
+        # only arguments (not temporaries) may be sep-tagged
+        ary = cast(ArrayArg,
+            kernel.arg_dict[ary._separation_info.subarray_names[tuple(sep_index)]])
 
     # }}}
 
     # {{{ process remaining dim tags
 
+    assert ary.dim_tags is not None
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, FixedStrideArrayDimTag):
             stride = dim_tag.stride
@@ -1386,18 +1275,24 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                             % (ary.name, i, dim_tag.stride, vector_size))
 
             elif stride is lp.auto:
-                stride = var(array_name + "_stride%d" % i)
+                raise AssertionError(
+                        f"Stride for axis {i+1} (1-based) of "
+                        f"'{ary.name}' should have been replaced "
+                        "with an actual argument by "
+                        "make_temporaries_for_offsets_and_strides "
+                        "during preprocessing.")
 
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
         elif isinstance(dim_tag, SeparateArrayArrayDimTag):
-            pass
+            raise AssertionError()
 
         elif isinstance(dim_tag, VectorArrayDimTag):
             from pymbolic.primitives import Variable
+            index_i = index[i]
             if (vectorization_info is not None
-                    and isinstance(index[i], Variable)
-                    and index[i].name == vectorization_info.iname):
+                    and isinstance(index_i, Variable)
+                    and index_i.name == vectorization_info.iname):
                 # We'll do absolutely nothing here, which will result
                 # in the vector being returned.
                 pass
@@ -1414,18 +1309,17 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
     # }}}
 
-    from pymbolic import var
     import loopy as lp
     if ary.offset:
         if num_target_axes > 1:
             raise NotImplementedError("offsets for multiple image axes")
 
-        subscripts[0] = _apply_offset(subscripts[0], array_name, ary)
+        subscripts[0] = _apply_offset(subscripts[0], ary)
 
     return AccessInfo(
-            array_name=array_name,
+            array_name=ary.name,
             vector_index=vector_index,
-            subscripts=subscripts)
+            subscripts=tuple(subscripts))
 
 # }}}
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index cccfee1228403fa82bd6fcb83fb059084dbbf35c..e61063246d1d66eb47217f2f9391a911fc42e2cd 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2447,6 +2447,25 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
     from pytools.tag import normalize_tags, check_tag_uniqueness
     tags = check_tag_uniqueness(normalize_tags(kwargs.pop("tags", frozenset())))
 
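+    # LoopKernel is now a plain dataclass and no longer normalizes these
+    # arguments in its constructor, so convert them here.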
+    index_dtype = kwargs.pop("index_dtype", None)
+    if index_dtype is None:
+        index_dtype = np.int32
+
+    from loopy.types import to_loopy_type
+    index_dtype = to_loopy_type(index_dtype)
+
+    preambles = kwargs.pop("preambles", None)
+    if preambles is None:
+        preambles = ()
+    elif not isinstance(preambles, tuple):
+        preambles = tuple(preambles)
+
+    preamble_generators = kwargs.pop("preamble_generators", None)
+    if preamble_generators is None:
+        preamble_generators = ()
+    elif not isinstance(preamble_generators, tuple):
+        preamble_generators = tuple(preamble_generators)
+
     from loopy.kernel import LoopKernel
     knl = LoopKernel(domains, instructions, kernel_args,
             temporary_variables=temporary_variables,
@@ -2456,6 +2475,9 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
             tags=tags,
             inames=inames,
             assumptions=assumptions,
+            index_dtype=index_dtype,
+            preambles=preambles,
+            preamble_generators=preamble_generators,
             **kwargs)
 
     from loopy.transform.instruction import uniquify_instruction_ids
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 8024d8ddd917407e6ab16ffd984df2dc5bc44c69..f0e4c823a8a8c0aceef9e497067ada4396753998 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -24,13 +24,23 @@ THE SOFTWARE.
 """
 
 
+from typing import (Type, Union, FrozenSet, Tuple, Optional, Sequence, Any, ClassVar,
+        cast)
 from sys import intern
+from dataclasses import dataclass, replace
+from enum import IntEnum
+from warnings import warn
+
+from immutables import Map
 import numpy as np  # noqa
 from pytools import ImmutableRecord
 from pytools.tag import Taggable
-from pytools.tag import UniqueTag as UniqueTagBase
-from loopy.kernel.array import ArrayBase
+from pytools.tag import UniqueTag as UniqueTagBase, Tag
+
+from loopy.kernel.array import ArrayBase, ArrayDimImplementationTag
 from loopy.diagnostic import LoopyError
+from loopy.typing import ExpressionT
+from loopy.types import LoopyType
 from loopy.kernel.instruction import (  # noqa
         InstructionBase,
         MemoryOrdering,
@@ -44,6 +54,7 @@ from loopy.kernel.instruction import (  # noqa
         make_assignment,
         CInstruction)
 
+
 __doc__ = """
 .. autofunction:: filter_iname_tags_by_type
 
@@ -64,10 +75,47 @@ __doc__ = """
 .. autoclass:: UnrollTag
 
 .. autoclass:: Iname
-
-.. autoclass:: KernelArgument
 """
 
+# This docstring is included in ref_internals. Do not include parts of the public
+# interface, e.g. TemporaryVariable, KernelArgument, ArrayArg.
+
+
+# {{{ utilities
+
+def _names_from_expr(expr: Union[None, ExpressionT, str]) -> FrozenSet[str]:
+    from numbers import Number
+    from loopy.symbolic import DependencyMapper
+    dep_mapper = DependencyMapper()
+
+    from pymbolic.primitives import Expression
+    if isinstance(expr, str):
+        return frozenset({expr})
+    elif isinstance(expr, Expression):
+        return frozenset(v.name for v in dep_mapper(expr))
+    elif expr is None:
+        return frozenset()
+    elif isinstance(expr, Number):
+        return frozenset()
+    else:
+        raise ValueError(f"unexpected value of expression-like object: '{expr}'")
+
+
+def _names_from_dim_tags(
+        dim_tags: Optional[Sequence[ArrayDimImplementationTag]]) -> FrozenSet[str]:
+    from loopy.kernel.array import FixedStrideArrayDimTag
+    if dim_tags is not None:
+        return frozenset({
+            name
+            for dim_tag in dim_tags
+            if isinstance(dim_tag, FixedStrideArrayDimTag)
+            for name in _names_from_expr(dim_tag.stride)
+            })
+    else:
+        return frozenset()
+
+# }}}
+
 
 class auto:  # noqa
     """A generic placeholder object for something that should be automatically
@@ -109,7 +157,7 @@ def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):
 
 
 class InameImplementationTag(ImmutableRecord, UniqueTagBase):
-    __slots__ = []
+    __slots__: ClassVar[Tuple[str, ...]] = ()
 
     def __hash__(self):
         return hash(self.key)
@@ -262,7 +310,7 @@ def parse_tag(tag):
 
 # {{{ memory address space
 
-class AddressSpace:
+class AddressSpace(IntEnum):
     """Storage location of a variable.
 
     .. attribute:: PRIVATE
@@ -278,7 +326,7 @@ class AddressSpace:
     GLOBAL = 2
 
     @classmethod
-    def stringify(cls, val):
+    def stringify(cls, val: Union["AddressSpace", Type[auto]]) -> str:
         if val == cls.PRIVATE:
             return "private"
         elif val == cls.LOCAL:
@@ -296,24 +344,36 @@ class AddressSpace:
 # {{{ arguments
 
 class KernelArgument(ImmutableRecord):
-    """Base class for all argument types"""
+    """Base class for all argument types.
+
+    .. attribute:: name
+    .. attribute:: dtype
+    .. attribute:: is_output
+    .. attribute:: is_input
+
+    .. automethod:: supporting_names
+    """
+    name: str
+    dtype: LoopyType
+    is_output: bool
+    is_input: bool
 
     def __init__(self, **kwargs):
         kwargs["name"] = intern(kwargs.pop("name"))
 
         target = kwargs.pop("target", None)
+        if target is not None:
+            warn("Passing 'target' is deprecated and will stop working in 2023. "
+                    "It is already being ignored.",
+                    DeprecationWarning, stacklevel=2)
 
         dtype = kwargs.pop("dtype", None)
 
-        if "for_atomic" in kwargs:
-            for_atomic = kwargs["for_atomic"]
-        else:
-            for_atomic = False
+        for_atomic = kwargs.pop("for_atomic", False)
 
         from loopy.types import to_loopy_type
         dtype = to_loopy_type(
-                dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic,
-                target=target)
+                dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic)
 
         import loopy as lp
         if dtype is lp.auto:
@@ -325,9 +385,27 @@ class KernelArgument(ImmutableRecord):
 
         ImmutableRecord.__init__(self, **kwargs)
 
+    def supporting_names(self) -> FrozenSet[str]:
+        """'Supporting' names are those that are likely to be required to be
+        present for any use of the argument.
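+
+        For illustration (hypothetical names): an :class:`ArrayArg` ``a``
+        whose ``offset`` is the name ``"a_offset"`` and whose strides involve
+        a variable ``n`` would report ``frozenset({"a_offset", "n"})``. This
+        base implementation reports no supporting names.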
+        """
+
+        return frozenset()
+
+
+@dataclass(frozen=True)
+class _ArraySeparationInfo:
+    """Not user-facing. If an array has been split because an axis
+    is tagged with :class:`~loopy.kernel.data.SeparateArrayArrayDimTag`,
+    this records the names of the actually present sub-arrays that
+    should be used to realize this array.
+    """
+    sep_axis_indices_set: FrozenSet[int]
+    subarray_names: Map[Tuple[int, ...], str]
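+
+    # Illustrative example (hypothetical names): for an argument "a" of shape
+    # (2, n) whose axis 0 is tagged "sep", make_arrays_for_sep_arrays (in
+    # loopy.preprocess) records
+    #     sep_axis_indices_set=frozenset({0}),
+    #     subarray_names=Map({(0,): "a_s0", (1,): "a_s1"}),
+    # and adds arguments "a_s0" and "a_s1" of shape (n,).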
+
 
 class ArrayArg(ArrayBase, KernelArgument):
-    __doc__ = ArrayBase.__doc__ + (
+    __doc__ = cast(str, ArrayBase.__doc__) + (
         """
         .. attribute:: address_space
 
@@ -347,11 +425,17 @@ class ArrayArg(ArrayBase, KernelArgument):
             at kernel entry.
         """)
 
-    allowed_extra_kwargs = [
+    address_space: AddressSpace
+
+    # _separation_info is not user-facing and hence not documented.
+    _separation_info: Optional[_ArraySeparationInfo]
+
+    allowed_extra_kwargs = (
             "address_space",
             "is_output",
             "is_input",
-            "tags"]
+            "tags",
+            "_separation_info")
 
     def __init__(self, *args, **kwargs):
         if "address_space" not in kwargs:
@@ -359,29 +443,27 @@ class ArrayArg(ArrayBase, KernelArgument):
 
         kwargs["is_output"] = kwargs.pop("is_output", None)
         kwargs["is_input"] = kwargs.pop("is_input", None)
+        kwargs["_separation_info"] = kwargs.pop("_separation_info", None)
 
         super().__init__(*args, **kwargs)
 
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
-        return ast_builder.get_array_arg_decl(self.name + name_suffix,
-                self.address_space, shape, dtype, is_written)
-
     def __str__(self):
-        # dont mention the type name if shape is known
+        # Don't mention the type of array arg if shape is known
+        # FIXME: Why?
         include_typename = self.shape in (None, auto)
 
         aspace_str = AddressSpace.stringify(self.address_space)
 
-        assert self.is_input or self.is_output
-
         inout = []
         if self.is_input:
             inout.append("in")
         if self.is_output:
             inout.append("out")
+        if not (self.is_input or self.is_output):
+            inout.append("neither_in_nor_out?")
 
         return (
                 self.stringify(include_typename=include_typename)
@@ -396,6 +478,15 @@ class ArrayArg(ArrayBase, KernelArgument):
         key_builder.rec(key_hash, self.address_space)
         key_builder.rec(key_hash, self.is_output)
         key_builder.rec(key_hash, self.is_input)
+        key_builder.rec(key_hash, self._separation_info)
+
+    def supporting_names(self) -> FrozenSet[str]:
+        # Do not consider separation info here: the subarrays do not
+        # "support" this array, they replace it.
+        return (
+                _names_from_expr(self.offset)
+                | _names_from_dim_tags(self.dim_tags)
+                )
 
 
 # Making this a function prevents incorrect use in isinstance.
@@ -426,10 +517,6 @@ class ConstantArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
-        return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape,
-                dtype, is_written)
-
 
 class ImageArg(ArrayBase, KernelArgument):
     __doc__ = ArrayBase.__doc__
@@ -449,12 +536,19 @@ class ImageArg(ArrayBase, KernelArgument):
 
     @property
     def dimensions(self):
+        assert self.dim_tags is not None
         return len(self.dim_tags)
 
     def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
         return ast_builder.get_image_arg_decl(self.name + name_suffix, shape,
                 self.num_target_axes(), dtype, is_written)
 
+    def supporting_names(self) -> FrozenSet[str]:
+        return (
+                _names_from_expr(self.offset)
+                | _names_from_dim_tags(self.dim_tags)
+                )
+
 
 """
     :attribute tags: A (possibly empty) frozenset of instances of
@@ -511,18 +605,13 @@ class ValueArg(KernelArgument, Taggable):
         return ast_builder.get_value_arg_decl(self.name, (),
                 self.dtype, False)
 
-
-class InameArg(ValueArg):
-    pass
-
 # }}}
 
 
 # {{{ temporary variable
 
-
 class TemporaryVariable(ArrayBase):
-    __doc__ = ArrayBase.__doc__ + """
+    __doc__ = cast(str, ArrayBase.__doc__) + """
     .. attribute:: storage_shape
     .. attribute:: base_indices
     .. attribute:: address_space
@@ -535,8 +624,9 @@ class TemporaryVariable(ArrayBase):
     .. attribute:: base_storage
 
         The name of a storage array that is to be used to actually
-        hold the data in this temporary. Note that this storage
-        array must not match any existing variable names.
+        hold the data in this temporary, or *None*. If it is not *None* and
+        does not name an existing variable, a variable of this name and
+        appropriate size will be created.
 
     .. attribute:: initializer
 
@@ -560,7 +650,7 @@ class TemporaryVariable(ArrayBase):
     min_target_axes = 0
     max_target_axes = 1
 
-    allowed_extra_kwargs = [
+    allowed_extra_kwargs = (
             "storage_shape",
             "base_indices",
             "address_space",
@@ -568,7 +658,7 @@ class TemporaryVariable(ArrayBase):
             "initializer",
             "read_only",
             "_base_storage_access_may_be_aliasing",
-            ]
+            )
 
     def __init__(self, name, dtype=None, shape=auto, address_space=None,
             dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
@@ -680,29 +770,20 @@ class TemporaryVariable(ArrayBase):
         from pytools import product
         return product(si for si in shape)*self.dtype.itemsize
 
-    def decl_info(self, target, index_dtype):
-        return super().decl_info(
-                target, is_written=True, index_dtype=index_dtype,
-                shape_override=self.storage_shape)
-
-    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
-        if self.address_space == AddressSpace.GLOBAL:
-            return ast_builder.get_array_arg_decl(self.name + name_suffix,
-                    AddressSpace.GLOBAL, shape, dtype, is_written)
-        else:
-            raise LoopyError("unexpected request for argument declaration of "
-                    "non-global temporary")
-
     def __str__(self):
         if self.address_space is auto:
             aspace_str = "auto"
         else:
             aspace_str = AddressSpace.stringify(self.address_space)
 
+        if self.base_storage is None:
+            bs_str = ""
+        else:
+            bs_str = " base_storage: "+str(self.base_storage)
+
         return (
                 self.stringify(include_typename=False)
-                +
-                " aspace:%s" % aspace_str)
+                + f" aspace: {aspace_str}{bs_str}")
 
     def __eq__(self, other):
         return (
@@ -739,6 +820,15 @@ class TemporaryVariable(ArrayBase):
         key_builder.rec(key_hash, self.read_only)
         key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
 
+    def supporting_names(self) -> FrozenSet[str]:
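+        # Besides names appearing in the offset and strides, the base_storage
+        # variable (if any) must be available wherever this temporary is used.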
+        return (
+                _names_from_expr(self.offset)
+                | _names_from_dim_tags(self.dim_tags)
+                | (
+                    frozenset({self.base_storage})
+                    if self.base_storage else frozenset())
+                )
+
 # }}}
 
 
@@ -809,6 +899,7 @@ class CallMangleInfo(ImmutableRecord):
 
 # {{{ Iname class
 
+@dataclass(frozen=True)
 class Iname(Taggable):
     """
     Records an iname in a :class:`~loopy.LoopKernel`. See :ref:`domain-tree` for
@@ -824,41 +915,19 @@ class Iname(Taggable):
 
         An instance of :class:`str`, denoting the iname's name.
 
-    .. attribute:: tas
+    .. attribute:: tags
 
         An instance of :class:`frozenset` of :class:`pytools.tag.Tag`.
     """
-    def __init__(self, name, tags=frozenset()):
-        super().__init__(tags=tags)
-
-        assert isinstance(name, str)
-        self.name = name
+    name: str
+    tags: FrozenSet[Tag]
 
-    def copy(self, *, name=None, tags=None):
-        if name is None:
-            name = self.name
-        if tags is None:
-            tags = self.tags
-
-        return type(self)(name=name, tags=tags)
+    def copy(self, **kwargs: Any) -> "Iname":
+        return replace(self, **kwargs)
 
     def _with_new_tags(self, tags):
         return self.copy(tags=tags)
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        """Custom hash computation function for use with
-        :class:`pytools.persistent_dict.PersistentDict`.
-        """
-        key_builder.rec(key_hash, type(self).__name__.encode("utf-8"))
-        key_builder.rec(key_hash, self.name)
-        key_builder.rec(key_hash, self.tags)
-
-    def __eq__(self, other):
-        return (
-                type(self) == type(other)
-                and self.name == other.name
-                and self.tags == other.tags)
-
 # }}}
 
 
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 427bcb621951a49146367878dbdd515cb369ff83..eb373a12df8426ba32ca3e24d6a9cf53323d617a 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import ClassVar, Tuple
+
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 
@@ -324,7 +326,8 @@ class InKernelCallable(ImmutableRecord):
 
     """
 
-    hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr")
+    hash_fields: ClassVar[Tuple[str, ...]] = (
+            "name", "arg_id_to_dtype", "arg_id_to_descr")
 
     def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None):
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 19af767ac28734fa193beee64d23cf688d45fd51..9a2437bf63feba1fd9114539248ecf889cae5ad2 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -21,12 +21,15 @@ THE SOFTWARE.
 """
 
 from sys import intern
+from functools import cached_property
+
+from warnings import warn
+import islpy as isl
 from pytools import ImmutableRecord, memoize_method
 from pytools.tag import Tag, tag_dataclass, Taggable
+
 from loopy.diagnostic import LoopyError
 from loopy.tools import Optional
-from warnings import warn
-import islpy as isl
 
 
 # {{{ instruction tags
@@ -412,8 +415,7 @@ class InstructionBase(ImmutableRecord, Taggable):
 
     # {{{ hashing and key building
 
-    @property
-    @memoize_method
+    @cached_property
     def _key_builder(self):
         from loopy.tools import LoopyEqKeyBuilder
         key_builder = LoopyEqKeyBuilder()
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index ba2fb22562036154981b98dbda9c063381890a62..99f5f3503289a36a9c6fef17bba4f8f6ad29c892 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -23,6 +23,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import Sequence, Mapping, FrozenSet, Dict, AbstractSet, Set, List
+
 import sys
 
 from sys import intern
@@ -638,7 +640,8 @@ def show_dependency_graph(*args, **kwargs):
 
 # {{{ is domain dependent on inames
 
-def is_domain_dependent_on_inames(kernel, domain_index, inames):
+def is_domain_dependent_on_inames(kernel: LoopKernel,
+        domain_index: int, inames: AbstractSet[str]) -> bool:
     dom = kernel.domains[domain_index]
     dom_parameters = set(dom.get_var_names(dim_type.param))
 
@@ -1763,7 +1766,26 @@ def find_most_recent_global_barrier(kernel, insn_id):
 # {{{ subkernel tools
 
 @memoize_on_first_arg
-def get_subkernels(kernel):
+def get_subkernel_start_indices(kernel: LoopKernel) -> Mapping[str, int]:
+    """Return a sequence of schedule indices of
+
+    See also :class:`loopy.schedule.CallKernel`.
+    """
+    from loopy.kernel import KernelState
+    if kernel.state != KernelState.LINEARIZED:
+        raise LoopyError("Kernel must be linearized")
+
+    assert kernel.linearization is not None
+
+    from loopy.schedule import CallKernel
+
+    return {lin_item.kernel_name: i
+            for i, lin_item in enumerate(kernel.linearization)
+            if isinstance(lin_item, CallKernel)}
+
+
+@memoize_on_first_arg
+def get_subkernels(kernel) -> Sequence[str]:
     """Return a :class:`tuple` of the names of the subkernels in the kernel. The
     kernel must be scheduled.
 
@@ -1773,15 +1795,14 @@ def get_subkernels(kernel):
     if kernel.state != KernelState.LINEARIZED:
         raise LoopyError("Kernel must be linearized")
 
-    from loopy.schedule import CallKernel
+    assert kernel.linearization is not None
 
-    return tuple(sched_item.kernel_name
-            for sched_item in kernel.linearization
-            if isinstance(sched_item, CallKernel))
+    return tuple(kernel.linearization[i].kernel_name
+            for i in get_subkernel_start_indices(kernel).values())
 
 
 @memoize_on_first_arg
-def get_subkernel_to_insn_id_map(kernel):
+def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]:
     """Return a :class:`dict` mapping subkernel names to a :class:`frozenset`
     consisting of the instruction ids scheduled within the subkernel. The
     kernel must be scheduled.
@@ -1790,26 +1811,52 @@ def get_subkernel_to_insn_id_map(kernel):
     if kernel.state != KernelState.LINEARIZED:
         raise LoopyError("Kernel must be scheduled")
 
+    assert kernel.linearization is not None
+
     from loopy.schedule import (
             sched_item_to_insn_id, CallKernel, ReturnFromKernel)
 
     subkernel = None
-    result = {}
+    result: Dict[str, Set[str]] = {}
 
-    for sched_item in kernel.linearization:
-        if isinstance(sched_item, CallKernel):
-            subkernel = sched_item.kernel_name
+    for lin_item in kernel.linearization:
+        if isinstance(lin_item, CallKernel):
+            subkernel = lin_item.kernel_name
             result[subkernel] = set()
 
-        if isinstance(sched_item, ReturnFromKernel):
+        if isinstance(lin_item, ReturnFromKernel):
             subkernel = None
 
         if subkernel is not None:
-            for insn_id in sched_item_to_insn_id(sched_item):
+            for insn_id in sched_item_to_insn_id(lin_item):
                 result[subkernel].add(insn_id)
 
     return {name: frozenset(insn_ids) for name, insn_ids in result.items()}
 
+
+@memoize_on_first_arg
+def get_subkernel_extra_inames(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]:
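+    """Return a mapping from subkernel names to the inames of the loops that
+    enclose the corresponding :class:`~loopy.schedule.CallKernel` in the
+    linearization.
+    """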
+    from loopy.kernel import KernelState
+    if kernel.state != KernelState.LINEARIZED:
+        raise LoopyError("Kernel must be scheduled")
+
+    assert kernel.linearization is not None
+
+    result = {}
+    inames: List[str] = []
+
+    from loopy.schedule import CallKernel, EnterLoop, LeaveLoop
+
+    for lin_item in kernel.linearization:
+        if isinstance(lin_item, CallKernel):
+            result[lin_item.kernel_name] = frozenset(inames)
+        elif isinstance(lin_item, EnterLoop):
+            inames.append(lin_item.iname)
+        elif isinstance(lin_item, LeaveLoop):
+            inames.pop()
+
+    return result
+
 # }}}
 
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index d7558960ab0c7e2c4f045655a068fc67d0785797..9c465653f4465524dfee8bcd7861b7e6dd052d6d 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -70,7 +70,7 @@ class IndexOfCallable(ScalarCallable):
 
         from loopy.kernel.array import get_access_info
         from pymbolic import evaluate
-        access_info = get_access_info(expression_to_code_mapper.kernel.target,
+        access_info = get_access_info(expression_to_code_mapper.kernel,
                 ary, arg.index, lambda expr: evaluate(expr,
                     expression_to_code_mapper.codegen_state.var_subst_map),
                 expression_to_code_mapper.codegen_state.vectorization_info)
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index ac749f6eff8b7e12b2376e916066218f76e4ebd2..190e6bd9abcd9e07e54dc13b607820201aa9d1fa 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -21,11 +21,13 @@ THE SOFTWARE.
 """
 
 
+from typing import ClassVar, Tuple
+
 from pymbolic import var
-from loopy.symbolic import ResolvedFunction
-from loopy.kernel.function_interface import ScalarCallable
 import numpy as np
 
+from loopy.symbolic import ResolvedFunction
+from loopy.kernel.function_interface import ScalarCallable
 from loopy.symbolic import FunctionIdentifier
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
@@ -271,7 +273,7 @@ class MinReductionOperation(ScalarReductionOperation):
 # {{{ base class for symbolic reduction ops
 
 class ReductionOpFunction(FunctionIdentifier):
-    init_arg_names = ("reduction_op",)
+    init_arg_names: ClassVar[Tuple[str, ...]] = ("reduction_op",)
 
     def __init__(self, reduction_op):
         self.reduction_op = reduction_op
diff --git a/loopy/match.py b/loopy/match.py
index 1ec6b32b497672eacca354f0aab6bf1c45bbfb5c..624276dce486dad3e7f1b8f8af9a757262f62a2e 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -291,7 +291,7 @@ class Tagged(GlobMatchExpressionBase):
 class Writes(GlobMatchExpressionBase):
     def __call__(self, kernel, matchable):
         return any(self.re.match(name)
-                for name in matchable.write_dependency_names())
+                for name in matchable.assignee_var_names())
 
 
 class Reads(GlobMatchExpressionBase):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 199efdace5a976b79141634e8deef95222c0668e..e26431c93f144b5781cb53960be56142fb21a51a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -20,15 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import Tuple, TypeVar, Iterable, Optional, List, FrozenSet, cast
 import logging
 logger = logging.getLogger(__name__)
 
+from immutables import Map
+import numpy as np
+
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn_with_kernel,
         LoopyAdvisory)
 
 from loopy.tools import memoize_on_disk
-from loopy.kernel.data import filter_iname_tags_by_type
+from loopy.kernel.data import filter_iname_tags_by_type, ArrayArg, auto, ValueArg
+
+from loopy.kernel import LoopKernel
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
 from loopy.symbolic import RuleAwareIdentityMapper
@@ -37,6 +43,11 @@ from loopy.symbolic import RuleAwareIdentityMapper
 from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
         CallInstruction,  _DataObliviousInstruction)
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.transform.data import allocate_temporaries_for_base_storage
+from loopy.kernel.array import ArrayDimImplementationTag
+from loopy.kernel.data import _ArraySeparationInfo, KernelArgument
+from loopy.translation_unit import for_each_kernel
+from loopy.typing import ExpressionT
 
 from pytools import ProcessLogger
 from functools import partial
@@ -100,6 +111,195 @@ def check_reduction_iname_uniqueness(kernel):
 # }}}
 
 
+# {{{ make_arrays_for_sep_arrays
+
+T = TypeVar("T")
+
+
+def _remove_at_indices(
+        indices: FrozenSet[int], values: Optional[Iterable[T]]
+        ) -> Optional[Tuple[T, ...]]:
+    """
+    Return *values* with the entries at the positions in *indices* removed,
+    or *None* if *values* is *None*.
+    """
+    if values is None:
+        return values
+
+    return tuple(val for i, val in enumerate(values) if i not in indices)
+
+
+@for_each_kernel
+def make_arrays_for_sep_arrays(kernel: LoopKernel) -> LoopKernel:
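+    """Split arguments that have axes tagged
+    :class:`~loopy.kernel.array.SeparateArrayArrayDimTag` into one sub-array
+    argument per index along the tagged axes, recording the sub-array names
+    in the original argument's ``_separation_info``.
+    """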
+    from loopy.kernel.array import SeparateArrayArrayDimTag
+    new_args = []
+
+    vng = kernel.get_var_name_generator()
+    made_changes = False
+
+    # {{{ rewrite arguments
+
+    for arg in kernel.args:
+        if not isinstance(arg, ArrayArg) or arg.dim_tags is None:
+            new_args.append(arg)
+            continue
+
+        sep_axis_indices = [
+                i for i, dim_tag in enumerate(arg.dim_tags)
+                if isinstance(dim_tag, SeparateArrayArrayDimTag)]
+
+        if not sep_axis_indices or arg._separation_info:
+            new_args.append(arg)
+            continue
+
+        made_changes = True
+
+        sep_axis_indices_set = frozenset(sep_axis_indices)
+
+        assert isinstance(arg.shape, tuple)
+        new_shape: Optional[Tuple[ExpressionT, ...]] = \
+                _remove_at_indices(sep_axis_indices_set, arg.shape)
+        new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = \
+                _remove_at_indices(sep_axis_indices_set, arg.dim_tags)
+        new_dim_names: Optional[Tuple[Optional[str], ...]] = \
+                _remove_at_indices(sep_axis_indices_set, arg.dim_names)
+
+        sep_shape: List[ExpressionT] = [arg.shape[i] for i in sep_axis_indices]
+        for i, sep_shape_i in enumerate(sep_shape):
+            if not isinstance(sep_shape_i, (int, np.integer)):
+                raise LoopyError(
+                        f"Axis {sep_axis_indices[i]+1} (1-based) of "
+                        f"argument '{arg.name}' is tagged 'sep', but "
+                        "does not have constant length.")
+
+        sep_info = _ArraySeparationInfo(
+                sep_axis_indices_set=sep_axis_indices_set,
+                subarray_names=Map({
+                    ind: vng(f"{arg.name}_s{'_'.join(str(i) for i in ind)}")
+                    for ind in np.ndindex(*cast(List[int], sep_shape))}))
+
+        new_args.append(arg.copy(_separation_info=sep_info))
+
+        for san in sorted(sep_info.subarray_names.values()):
+            new_args.append(
+                    arg.copy(
+                        name=san,
+                        shape=new_shape,
+                        dim_tags=new_dim_tags,
+                        dim_names=new_dim_names))
+
+    # }}}
+
+    if not made_changes:
+        return kernel
+
+    kernel = kernel.copy(args=new_args)
+
+    return kernel
+
+# }}}
+
+
+# {{{ make arguments for offsets and strides
+
+def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
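+    """Replace *auto* offsets and strides on array arguments with references
+    to newly created value arguments (named, e.g., ``<name>_offset`` and
+    ``<name>_stride<i>``) that are appended to the kernel's argument list.
+    """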
+    additional_args: List[KernelArgument] = []
+
+    vng = kernel.get_var_name_generator()
+
+    from pymbolic.primitives import Expression, Variable
+    from loopy.kernel.array import FixedStrideArrayDimTag
+
+    # {{{ process arguments
+
+    new_args = []
+    for arg in kernel.args:
+        if isinstance(arg, ArrayArg) and not arg._separation_info:
+            what = f"offset for argument '{arg.name}'"
+            if arg.offset is None:
+                pass
+            elif arg.offset is auto:
+                offset_name = vng(arg.name+"_offset")
+                additional_args.append(ValueArg(
+                        offset_name, kernel.index_dtype))
+                arg = arg.copy(offset=offset_name)
+            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+                pass
+            else:
+                raise LoopyError(f"invalid value of {what}")
+
+            if arg.dim_tags is None:
+                new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]]  \
+                        = arg.dim_tags
+            else:
+                new_dim_tags = ()
+                for iaxis, dim_tag in enumerate(arg.dim_tags):
+                    if isinstance(dim_tag, FixedStrideArrayDimTag):
+                        what = ("axis stride for axis "
+                                f"{iaxis+1} (1-based) of '{arg.name}'")
+                        if dim_tag.stride is auto:
+                            stride_name = vng(f"{arg.name}_stride{iaxis}")
+                            dim_tag = dim_tag.copy(stride=Variable(stride_name))
+                            additional_args.append(ValueArg(
+                                    stride_name, kernel.index_dtype))
+                        elif isinstance(
+                                dim_tag.stride, (int, np.integer, Expression)):
+                            pass
+                        else:
+                            raise LoopyError(f"invalid value of {what}")
+
+                    new_dim_tags = new_dim_tags + (dim_tag,)
+
+            arg = arg.copy(dim_tags=new_dim_tags)
+
+        new_args.append(arg)
+
+    # }}}
+
+    if not additional_args:
+        return kernel
+    else:
+        return kernel.copy(args=new_args + additional_args)
+
+# }}}
+
+
+# {{{ zero_offsets
+
+def zero_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
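+    """Applied to kernels that are not entrypoints (see
+    ``_preprocess_single_kernel``): *auto* argument offsets are set to zero,
+    and non-zero concrete offsets are rejected. Strides are not modified
+    here.
+    """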
+    made_changes = False
+    from pymbolic.primitives import Expression
+
+    # {{{ process arguments
+
+    new_args = []
+    for arg in kernel.args:
+        if isinstance(arg, ArrayArg):
+            if arg.offset is None:
+                pass
+            elif arg.offset is auto:
+                made_changes = True
+                arg = arg.copy(offset=0)
+            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+                from pymbolic.primitives import is_zero
+                if not is_zero(arg.offset):
+                    raise LoopyError(
+                        f"Non-zero offset on argument '{arg.name}' "
+                        f"of callable kernel '{kernel.name}'. This is not allowed.")
+            else:
+                raise LoopyError(f"invalid value of offset for '{arg.name}'")
+
+        new_args.append(arg)
+
+    # }}}
+
+    if not made_changes:
+        return kernel
+    else:
+        return kernel.copy(args=new_args)
+
+# }}}
+
+
 # {{{ decide temporary address space
 
 def _get_compute_inames_tagged(kernel, insn, tag_base):
@@ -129,16 +329,8 @@ def find_temporary_address_space(kernel):
 
     base_storage_to_aliases = {}
 
-    kernel_var_names = kernel.all_variable_names(include_temp_storage=False)
-
     for temp_var in kernel.temporary_variables.values():
         if temp_var.base_storage is not None:
-            # no nesting allowed
-            if temp_var.base_storage in kernel_var_names:
-                raise LoopyError("base_storage for temporary '%s' is '%s', "
-                        "which is an existing variable name"
-                        % (temp_var.name, temp_var.base_storage))
-
             base_storage_to_aliases.setdefault(
                     temp_var.base_storage, []).append(temp_var.name)
 
@@ -535,11 +727,19 @@ def filter_reachable_callables(t_unit):
     return t_unit.copy(callables_table=new_callables)
 
 
-def _preprocess_single_kernel(kernel, callables_table):
+def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKernel:
     from loopy.kernel import KernelState
 
     prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name)
 
+    kernel = make_arrays_for_sep_arrays(kernel)
+
+    if is_entrypoint:
+        kernel = make_args_for_offsets_and_strides(kernel)
+    else:
+        # No need for offsets internally, we can pass arbitrary pointers.
+        kernel = zero_offsets_and_strides(kernel)
+
     from loopy.check import check_identifiers_in_subst_rules
     check_identifiers_in_subst_rules(kernel)
 
@@ -569,6 +769,10 @@ def _preprocess_single_kernel(kernel, callables_table):
 
     kernel = find_temporary_address_space(kernel)
 
+    # Ordering restriction: temporary address spaces need to be found before
+    # allocating base_storage
+    kernel = allocate_temporaries_for_base_storage(kernel, _implicitly_run=True)
+
     # check for atomic loads, much easier to do here now that the dependencies
     # have been established
     kernel = check_atomic_loads(kernel)
@@ -637,7 +841,8 @@ def preprocess_program(program):
     for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = _preprocess_single_kernel(
-                    in_knl_callable.subkernel, program.callables_table)
+                    in_knl_callable.subkernel,
+                    is_entrypoint=func_id in program.entrypoints)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
         elif isinstance(in_knl_callable, ScalarCallable):
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index b46f04826d27cb7f2503f919c42bbadd3b4ecaf5..9dc3bdc365219ccad8e580b2250d379fe611942a 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -20,9 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import sys
+from dataclasses import dataclass, replace
+from typing import Any, TypeVar
 
 from pytools import ImmutableRecord
-import sys
 import islpy as isl
 from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel
 
@@ -50,17 +52,15 @@ __doc__ = """
 """
 
 
-# {{{ schedule items
+SelfT = TypeVar("SelfT")
 
-class ScheduleItem(ImmutableRecord):
-    __slots__ = []
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        """Custom hash computation function for use with
-        :class:`pytools.persistent_dict.PersistentDict`.
-        """
-        for field_name in self.hash_fields:
-            key_builder.rec(key_hash, getattr(self, field_name))
+# {{{ schedule items
+
+@dataclass(frozen=True)
+class ScheduleItem:
+    def copy(self: SelfT, **kwargs: Any) -> SelfT:
+        return replace(self, **kwargs)
 
 
 class BeginBlockItem(ScheduleItem):
@@ -71,26 +71,32 @@ class EndBlockItem(ScheduleItem):
     pass
 
 
+@dataclass(frozen=True)
 class EnterLoop(BeginBlockItem):
-    hash_fields = __slots__ = ["iname"]
+    iname: str
 
 
+@dataclass(frozen=True)
 class LeaveLoop(EndBlockItem):
-    hash_fields = __slots__ = ["iname"]
+    iname: str
 
 
+@dataclass(frozen=True)
 class RunInstruction(ScheduleItem):
-    hash_fields = __slots__ = ["insn_id"]
+    insn_id: str
 
 
+@dataclass(frozen=True)
 class CallKernel(BeginBlockItem):
-    hash_fields = __slots__ = ["kernel_name", "extra_args", "extra_inames"]
+    kernel_name: str
 
 
+@dataclass(frozen=True)
 class ReturnFromKernel(EndBlockItem):
-    hash_fields = __slots__ = ["kernel_name"]
+    kernel_name: str
 
 
+@dataclass(frozen=True)
 class Barrier(ScheduleItem):
     """
     .. attribute:: comment
@@ -107,9 +113,10 @@ class Barrier(ScheduleItem):
 
     .. attribute:: originating_insn_id
     """
-
-    hash_fields = ["comment", "synchronization_kind", "mem_kind"]
-    __slots__ = hash_fields + ["originating_insn_id"]
+    comment: str
+    synchronization_kind: str
+    mem_kind: str
+    originating_insn_id: str
 
 # }}}
 
@@ -450,11 +457,7 @@ def dump_schedule(kernel, schedule):
             indent = indent[:-4]
             lines.append(indent + "end %s" % sched_item.iname)
         elif isinstance(sched_item, CallKernel):
-            lines.append(indent +
-                         "CALL KERNEL {}(extra_args={}, extra_inames={})".format(
-                             sched_item.kernel_name,
-                             sched_item.extra_args,
-                             sched_item.extra_inames))
+            lines.append(indent + f"CALL KERNEL {sched_item.kernel_name}")
             indent += "    "
         elif isinstance(sched_item, ReturnFromKernel):
             indent = indent[:-4]
@@ -2102,8 +2105,6 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args=None):
                 # Device mapper only gets run once.
                 new_kernel = map_schedule_onto_host_or_device(new_kernel)
 
-            from loopy.schedule.tools import add_extra_args_to_schedule
-            new_kernel = add_extra_args_to_schedule(new_kernel)
             yield new_kernel
 
             debug.start()
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index 35c73b775395f488df4a2e46fdae35c8cc434948..38d03a2960487c023c8a6be3d8f493d59a3a0db1 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -21,7 +21,7 @@ THE SOFTWARE.
 """
 
 from loopy.diagnostic import LoopyError
-from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop,
+from loopy.schedule import (Barrier, CallKernel, EnterLoop,
                             ReturnFromKernel, RunInstruction)
 from loopy.schedule.tools import get_block_boundaries
 
@@ -40,9 +40,7 @@ def map_schedule_onto_host_or_device(kernel):
 
     if not kernel.target.split_kernel_at_global_barriers():
         new_schedule = (
-            [CallKernel(kernel_name=device_prog_name_gen(),
-                        extra_args=[],
-                        extra_inames=[])] +
+            [CallKernel(kernel_name=device_prog_name_gen())] +
             list(kernel.linearization) +
             [ReturnFromKernel(kernel_name=kernel.name)])
         kernel = kernel.copy(linearization=new_schedule)
@@ -59,7 +57,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
 
     # {{{ inner mapper function
 
-    dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[])
+    dummy_call = CallKernel(kernel_name="")
     dummy_return = ReturnFromKernel(kernel_name="")
 
     def inner_mapper(start_idx, end_idx, new_schedule):
@@ -143,22 +141,14 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
             new_schedule +
             [dummy_return.copy()])
 
-    # Assign names, extra_inames to CallKernel / ReturnFromKernel instructions
-    inames = []
+    # Assign names to CallKernel / ReturnFromKernel instructions
 
     for idx, sched_item in enumerate(new_schedule):
         if isinstance(sched_item, CallKernel):
             last_kernel_name = device_prog_name_gen()
-            new_schedule[idx] = sched_item.copy(
-                kernel_name=last_kernel_name,
-                extra_inames=list(inames))
+            new_schedule[idx] = sched_item.copy(kernel_name=last_kernel_name)
         elif isinstance(sched_item, ReturnFromKernel):
-            new_schedule[idx] = sched_item.copy(
-                kernel_name=last_kernel_name)
-        elif isinstance(sched_item, EnterLoop):
-            inames.append(sched_item.iname)
-        elif isinstance(sched_item, LeaveLoop):
-            inames.pop()
+            new_schedule[idx] = sched_item.copy(kernel_name=last_kernel_name)
 
     new_kernel = kernel.copy(linearization=new_schedule)
 
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index d021b0aa43edf9c9f3373857b63b7cb89a83032d..f2164b6dd8c328b826376015d6653418e694ca9d 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -20,10 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from loopy.kernel.data import AddressSpace
+from functools import cached_property
+import enum
+from typing import Sequence, FrozenSet, Tuple, List, Set, Dict
+from dataclasses import dataclass
+
 from pytools import memoize_method
 import islpy as isl
-import enum
+
+from loopy.kernel.data import AddressSpace, TemporaryVariable, ArrayArg
+from loopy.kernel import LoopKernel
 
 
 # {{{ block boundary finder
@@ -51,60 +57,244 @@ def get_block_boundaries(schedule):
 
 # {{{ subkernel tools
 
-def temporaries_read_in_subkernel(kernel, subkernel):
+def temporaries_read_in_subkernel(
+        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
-    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
+    inames = frozenset().union(*(kernel.insn_inames(insn_id)
+                                 for insn_id in insn_ids))
+    domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames}
+    params = frozenset().union(*(
+        kernel.domains[dom_idx].get_var_names(isl.dim_type.param)
+        for dom_idx in domain_idxs))
+
+    return (frozenset(tv
+                for insn_id in insn_ids
+                for tv in kernel.id_to_insn[insn_id].read_dependency_names()
+                if tv in kernel.temporary_variables)
+            | (params & frozenset(kernel.temporary_variables)))
+
+
+def temporaries_written_in_subkernel(
+        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
     return frozenset(tv
-            for insn_id in insn_ids
-            for tv in kernel.id_to_insn[insn_id].read_dependency_names()
-            if tv in kernel.temporary_variables)
+                for insn_id in insn_ids
+                for tv in kernel.id_to_insn[insn_id].assignee_var_names()
+                if tv in kernel.temporary_variables)
 
 
-def temporaries_written_in_subkernel(kernel, subkernel):
+def args_read_in_subkernel(
+        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
-    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
-    return frozenset(tv
-            for insn_id in insn_ids
-            for tv in kernel.id_to_insn[insn_id].write_dependency_names()
-            if tv in kernel.temporary_variables)
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
+    inames = frozenset().union(*(kernel.insn_inames(insn_id)
+                                 for insn_id in insn_ids))
+    domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames}
+    params = frozenset().union(*(
+        kernel.domains[dom_idx].get_var_names(isl.dim_type.param)
+        for dom_idx in domain_idxs))
+    return (frozenset(arg
+                for insn_id in insn_ids
+                for arg in kernel.id_to_insn[insn_id].read_dependency_names()
+                if arg in kernel.arg_dict)
+            | (params & frozenset(kernel.arg_dict)))
+
+
+def args_written_in_subkernel(
+        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
+    return frozenset(arg
+                for insn_id in insn_ids
+                for arg in kernel.id_to_insn[insn_id].assignee_var_names()
+                if arg in kernel.arg_dict)
+
+
+def supporting_temporary_names(
+        kernel: LoopKernel, tv_names: FrozenSet[str]) -> FrozenSet[str]:
+    result: Set[str] = set()
+
+    for name in tv_names:
+        tv = kernel.temporary_variables[name]
+        for supp_name in tv.supporting_names():
+            if supp_name in kernel.temporary_variables:
+                result.add(supp_name)
+
+    return frozenset(result)
 
 # }}}
 
 
-# {{{ add extra args to schedule
+# {{{ argument lists
 
-def add_extra_args_to_schedule(kernel):
-    """
-    Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel`
-    instructions in the schedule with global temporaries.
-    """
-    new_schedule = []
-    from loopy.schedule import CallKernel
+@dataclass(frozen=True)
+class KernelArgInfo:
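+    """Records the arguments to be passed to a kernel (*passed_arg_names*)
+    along with the set of names that are written (*written_names*).
+    """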
+    passed_arg_names: Sequence[str]
+
+    written_names: FrozenSet[str]
+
+    @property
+    def passed_names(self) -> Sequence[str]:
+        return self.passed_arg_names
+
+
+@dataclass(frozen=True)
+class SubKernelArgInfo(KernelArgInfo):
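+    """Extends :class:`KernelArgInfo` for a single subkernel with the inames
+    and global temporaries that must additionally be passed; ``passed_names``
+    lists arguments, then inames, then temporaries.
+    """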
+    passed_inames: Sequence[str]
+    passed_temporaries: Sequence[str]
+
+    @property
+    def passed_names(self) -> Sequence[str]:
+        return (list(self.passed_arg_names)
+                + list(self.passed_inames)
+                + list(self.passed_temporaries))
+
+
+def _should_temp_var_be_passed(tv: TemporaryVariable) -> bool:
+    return tv.address_space == AddressSpace.GLOBAL and tv.initializer is None
+
+
+class _SupportingNameTracker:
+    def __init__(self, kernel: LoopKernel):
+        self.kernel = kernel
+        self.name_to_main_name: Dict[str, FrozenSet[str]] = {}
+
+    def add_supporting_names_for(self, name):
+        var_descr = self.kernel.get_var_descriptor(name)
+        for supp_name in var_descr.supporting_names():
+            self.name_to_main_name[supp_name] = (
+                    self.name_to_main_name.get(supp_name, frozenset())
+                    | {name})
+
+    def get_additional_args_and_tvs(
+            self, already_passed: Set[str]
+            ) -> Tuple[List[str], List[str]]:
+        additional_args = []
+        additional_temporaries = []
+
+        for supporting_name in sorted(frozenset(self.name_to_main_name)):
+            if supporting_name not in already_passed:
+                already_passed.add(supporting_name)
+                var_descr = self.kernel.get_var_descriptor(supporting_name)
+                if isinstance(var_descr, TemporaryVariable):
+                    if _should_temp_var_be_passed(var_descr):
+                        additional_temporaries.append(supporting_name)
+                else:
+                    additional_args.append(supporting_name)
+
+        return additional_args, additional_temporaries
+
+
+def _process_args_for_arg_info(
+        kernel: LoopKernel, args_read: Set[str], args_written: Set[str],
+        supp_name_tracker: _SupportingNameTracker, used_only: bool,
+        ) -> List[str]:
+
+    args_expected: Set[str] = set()
+
+    passed_arg_names = []
+    for arg in kernel.args:
+        if used_only and not (arg.name in args_read or arg.name in args_written):
+            continue
+
+        try:
+            args_expected.remove(arg.name)
+        except KeyError:
+            pass
+
+        # Disregard the original array if it had a sep-tagged axis.
+        if isinstance(arg, ArrayArg):
+            if not arg._separation_info:
+                passed_arg_names.append(arg.name)
+                supp_name_tracker.add_supporting_names_for(arg.name)
+            else:
+                for sep_name in sorted(arg._separation_info.subarray_names.values()):
+                    # Separated arrays occur later in the argument list.
+                    # Mark them as accessed if the original array was,
+                    # Mark them as accessed if the original array was;
+                    # Add them to args_expected to ensure they're not missed.
+                    if arg.name in args_read:
+                        args_read.add(sep_name)
+                        args_expected.add(sep_name)
+                    if arg.name in args_written:
+                        args_written.add(sep_name)
+                        args_expected.add(sep_name)
 
-    for sched_item in kernel.linearization:
-        if isinstance(sched_item, CallKernel):
-            subkernel = sched_item.kernel_name
-
-            used_temporaries = (
-                    temporaries_read_in_subkernel(kernel, subkernel)
-                    | temporaries_written_in_subkernel(kernel, subkernel))
-
-            more_args = {tv
-                    for tv in used_temporaries
-                    if
-                    kernel.temporary_variables[tv].address_space
-                    == AddressSpace.GLOBAL
-                    and
-                    kernel.temporary_variables[tv].initializer is None
-                    and
-                    tv not in sched_item.extra_args}
-
-            new_schedule.append(sched_item.copy(
-                    extra_args=sched_item.extra_args + sorted(more_args)))
         else:
-            new_schedule.append(sched_item)
+            passed_arg_names.append(arg.name)
+            supp_name_tracker.add_supporting_names_for(arg.name)
+
+    assert not args_expected
+
+    return passed_arg_names
+
+
+def get_kernel_arg_info(kernel: LoopKernel) -> KernelArgInfo:
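+    """Return a :class:`KernelArgInfo` for the kernel as a whole: all kernel
+    arguments (plus any supporting arguments they require, e.g. offset or
+    stride arguments) and the set of written argument names.
+    """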
+    args_written = set(kernel.arg_dict) & kernel.get_written_variables()
+
+    supp_name_tracker = _SupportingNameTracker(kernel)
+
+    passed_arg_names = _process_args_for_arg_info(kernel,
+            args_read=set(), args_written=args_written,
+            supp_name_tracker=supp_name_tracker,
+            used_only=False)
+
+    additional_args, additional_temporaries = \
+            supp_name_tracker.get_additional_args_and_tvs(
+                    already_passed=(
+                        set(passed_arg_names)))
+
+    assert not additional_temporaries
+
+    return KernelArgInfo(
+            passed_arg_names=passed_arg_names + additional_args,
+            written_names=frozenset(args_written))
+
+
+def get_subkernel_arg_info(
+        kernel: LoopKernel, subkernel_name: str) -> SubKernelArgInfo:
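+    """Return a :class:`SubKernelArgInfo` for *subkernel_name*, listing the
+    kernel arguments, extra inames, and global temporaries that need to be
+    passed to that subkernel, along with the names written in it.
+    """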
+    assert kernel.linearization is not None
 
-    return kernel.copy(linearization=new_schedule)
+    args_read = set(args_read_in_subkernel(kernel, subkernel_name))
+    args_written = set(args_written_in_subkernel(kernel, subkernel_name))
+
+    tvs_read = temporaries_read_in_subkernel(kernel, subkernel_name)
+    tvs_written = set(temporaries_written_in_subkernel(kernel, subkernel_name))
+
+    supp_name_tracker = _SupportingNameTracker(kernel)
+
+    passed_arg_names = _process_args_for_arg_info(kernel,
+            args_read=args_read, args_written=args_written,
+            supp_name_tracker=supp_name_tracker,
+            used_only=True)
+
+    passed_temporaries: List[str] = []
+    for tv_name in sorted(tvs_read | tvs_written):
+        supp_name_tracker.add_supporting_names_for(tv_name)
+        tv = kernel.temporary_variables[tv_name]
+
+        if _should_temp_var_be_passed(tv):
+            if tv.base_storage:
+                if tv_name in tvs_written:
+                    tvs_written.add(tv.base_storage)
+            else:
+                passed_temporaries.append(tv.name)
+
+    additional_args, additional_temporaries = \
+            supp_name_tracker.get_additional_args_and_tvs(
+                    already_passed=(
+                        set(passed_arg_names) | set(passed_temporaries)))
+
+    from loopy.kernel.tools import get_subkernel_extra_inames
+
+    return SubKernelArgInfo(
+            passed_arg_names=passed_arg_names + additional_args,
+            passed_inames=sorted(get_subkernel_extra_inames(kernel)[subkernel_name]),
+            passed_temporaries=passed_temporaries + additional_temporaries,
+            written_names=frozenset(args_written | tvs_written))
 
 # }}}
 
@@ -287,8 +477,7 @@ class WriteRaceChecker:
         self.kernel = kernel
         self.callables_table = callables_table
 
-    @property
-    @memoize_method
+    @cached_property
     def vars(self):
         return (self.kernel.get_written_variables()
                 | self.kernel.get_read_variables())
@@ -364,3 +553,5 @@ class WriteRaceChecker:
                                        self.kernel, self.callables_table)
 
 # }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 913400eb34285beba63fd305b3e2524353610232..bdcdb0878a4b0249da38d80757cf540d29d8a597 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -25,10 +25,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import loopy as lp
+from functools import partial, cached_property
+
 from islpy import dim_type
 import islpy as isl
 from pymbolic.mapper import CombineMapper
+
+import loopy as lp
 from loopy.kernel.data import (
         MultiAssignmentBase, TemporaryVariable, AddressSpace)
 from loopy.diagnostic import warn_with_kernel, LoopyError
@@ -36,7 +39,6 @@ from loopy.symbolic import CoefficientCollector
 from pytools import ImmutableRecord, memoize_method
 from loopy.kernel.function_interface import CallableKernel
 from loopy.translation_unit import TranslationUnit
-from functools import partial
 
 
 __doc__ = """
@@ -813,8 +815,7 @@ class CounterBase(CombineMapper):
         self.zero = get_kernel_zero_pwqpolynomial(self.knl)
         self.one = self.zero + 1
 
-    @property
-    @memoize_method
+    @cached_property
     def param_space(self):
         return get_kernel_parameter_space(self.knl)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index c1708adfd772e5ca8f2282e7cbade29c07f07cc1..2226dc3e467173db454df9a8c7540a3b9af89982 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -24,12 +24,19 @@ THE SOFTWARE.
 """
 
 
-from functools import reduce
+from typing import ClassVar, Tuple
+from functools import reduce, cached_property
 from sys import intern
+import re
+
+import numpy as np
 
-from pytools import memoize, memoize_method, memoize_on_first_arg, ImmutableRecord
+from pytools import (memoize, memoize_method, memoize_on_first_arg,
+        ImmutableRecord)
 import pytools.lex
 from pytools.tag import Taggable
+import islpy as isl
+from islpy import dim_type
 
 import pymbolic.primitives as p
 
@@ -64,12 +71,6 @@ from loopy.diagnostic import (ExpressionToAffineConversionError,
                               UnableToDetermineAccessRangeError)
 
 
-import islpy as isl
-from islpy import dim_type
-
-import re
-import numpy as np
-
 __doc__ = """
 .. currentmodule:: loopy.symbolic
 
@@ -592,7 +593,7 @@ class LocalHardwareAxisIndex(HardwareAxisIndex):
 class FunctionIdentifier(LoopyExpressionBase):
     """A base class for symbols representing functions."""
 
-    init_arg_names = ()
+    init_arg_names: ClassVar[Tuple[str, ...]] = ()
 
     mapper_method = intern("map_loopy_function_identifier")
 
@@ -801,8 +802,7 @@ class Reduction(LoopyExpressionBase):
     def is_tuple_typed(self):
         return self.operation.arg_count > 1
 
-    @property
-    @memoize_method
+    @cached_property
     def inames_set(self):
         return set(self.inames)
 
@@ -2680,8 +2680,7 @@ class AccessRangeOverlapChecker:
     def __init__(self, kernel):
         self.kernel = kernel
 
-    @property
-    @memoize_method
+    @cached_property
     def vars(self):
         return (self.kernel.get_written_variables()
                 | self.kernel.get_read_variables())
@@ -2722,7 +2721,7 @@ class AccessRangeOverlapChecker:
             if access_dir == "any":
                 return var_name in insn.dependency_names()
             else:
-                return var_name in insn.write_dependency_names()
+                return var_name in insn.assignee_var_names()
 
         return self._get_access_ranges(insn_id, access_dir)[var_name]
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index a6357a12b61d2b837ab1cd016554c2c5af100024..e38624b4316b3e4b7855e4283faf6b1228a0bb69 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -1,4 +1,27 @@
-"""Base target interface."""
+"""
+.. currentmodule:: loopy
+
+.. autoclass:: TargetBase
+.. autoclass:: ASTBuilderBase
+.. autoclass:: CFamilyTarget
+.. autoclass:: CTarget
+.. autoclass:: ExecutableCTarget
+.. autoclass:: CudaTarget
+.. autoclass:: OpenCLTarget
+.. autoclass:: PyOpenCLTarget
+.. autoclass:: ISPCTarget
+
+References to Canonical Names
+-----------------------------
+
+.. currentmodule:: loopy.target
+
+.. class:: TargetBase
+
+    See :class:`loopy.TargetBase`.
+"""
+
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -23,34 +46,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-__doc__ = """
 
-.. currentmodule:: loopy
+from typing import (Any, Tuple, Generic, TypeVar, Sequence, ClassVar, Optional,
+        TYPE_CHECKING)
 
-.. autoclass:: TargetBase
-.. autoclass:: ASTBuilderBase
-.. autoclass:: CFamilyTarget
-.. autoclass:: CTarget
-.. autoclass:: ExecutableCTarget
-.. autoclass:: CudaTarget
-.. autoclass:: OpenCLTarget
-.. autoclass:: PyOpenCLTarget
-.. autoclass:: ISPCTarget
-.. autoclass:: NumbaTarget
-.. autoclass:: NumbaCudaTarget
-
-References to Canonical Names
------------------------------
+if TYPE_CHECKING:
+    from loopy.typing import ExpressionT
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
 
-.. currentmodule:: loopy.target
 
-.. class:: TargetBase
-
-    See :class:`loopy.TargetBase`.
-"""
+ASTType = TypeVar("ASTType")
 
 
-class TargetBase:
+class TargetBase():
     """Base class for all targets, i.e. different combinations of code that
     loopy can generate.
 
@@ -59,8 +68,8 @@ class TargetBase:
 
     # {{{ persistent hashing
 
-    hash_fields = ()
-    comparison_fields = ()
+    hash_fields: ClassVar[Tuple[str, ...]] = ()
+    comparison_fields: ClassVar[Tuple[str, ...]] = ()
 
     def update_persistent_hash(self, key_hash, key_builder):
         key_hash.update(type(self).__name__.encode())
@@ -100,20 +109,20 @@ class TargetBase:
     device_program_name_prefix = ""
     device_program_name_suffix = ""
 
-    def split_kernel_at_global_barriers(self):
+    def split_kernel_at_global_barriers(self) -> bool:
         """
         :returns: a :class:`bool` indicating whether the kernel should
             be split when a global barrier is encountered.
         """
         raise NotImplementedError()
 
-    def get_host_ast_builder(self):
+    def get_host_ast_builder(self) -> ASTBuilderBase[Any]:
         """
         :returns: a class implementing :class:`ASTBuilderBase` for the host code
         """
         raise NotImplementedError()
 
-    def get_device_ast_builder(self):
+    def get_device_ast_builder(self) -> ASTBuilderBase[Any]:
         """
         :returns: a class implementing :class:`ASTBuilderBase` for the host code
         """
@@ -151,7 +160,7 @@ class TargetBase:
         raise NotImplementedError()
 
 
-class ASTBuilderBase:
+class ASTBuilderBase(Generic[ASTType]):
     """An interface for generating (host or device) ASTs.
     """
 
@@ -184,28 +193,38 @@ class ASTBuilderBase:
     def ast_module(self):
         raise NotImplementedError()
 
-    def get_function_definition(self, codegen_state, codegen_result,
-            schedule_index, function_decl, function_body):
+    def get_function_definition(
+            self, codegen_state: CodeGenerationState,
+            codegen_result: CodeGenerationResult,
+            schedule_index: int, function_decl: ASTType, function_body: ASTType
+            ) -> ASTType:
         raise NotImplementedError
 
-    def get_function_declaration(self, codegen_state, codegen_result,
-            schedule_index):
+    def get_function_declaration(
+            self, codegen_state: CodeGenerationState,
+            codegen_result: CodeGenerationResult, schedule_index: int
+            ) -> ASTType:
         raise NotImplementedError
 
-    def generate_top_of_body(self, codegen_state):
+    def generate_top_of_body(
+            self, codegen_state: CodeGenerationState) -> Sequence[ASTType]:
         return []
 
-    def get_temporary_decls(self, codegen_state, schedule_index):
+    def get_temporary_decls(self, codegen_state: CodeGenerationState,
+            schedule_index: int) -> ASTType:
         raise NotImplementedError
 
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
-        raise NotImplementedError
+    def get_kernel_call(self, codegen_state: CodeGenerationState,
+            subkernel_name: str,
+            gsize: Tuple[ExpressionT, ...],
+            lsize: Tuple[ExpressionT, ...]) -> Optional[ASTType]:
+        raise NotImplementedError()
 
     @property
     def ast_block_class(self):
         raise NotImplementedError()
 
-    def get_expression_to_code_mapper(self, codegen_state):
+    def get_expression_to_code_mapper(self, codegen_state: CodeGenerationState):
         raise NotImplementedError()
 
     def add_vector_access(self, access_expr, index):
@@ -218,15 +237,6 @@ class ASTBuilderBase:
         """
         raise NotImplementedError()
 
-    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
-        raise NotImplementedError()
-
-    def get_global_arg_decl(self, name, shape, dtype, is_written):
-        raise NotImplementedError()
-
-    def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
-        raise NotImplementedError()
-
     def emit_assignment(self, codegen_state, insn):
         raise NotImplementedError()
 
@@ -294,7 +304,7 @@ class DummyHostASTBuilder(ASTBuilderBase):
     def get_expression_to_code_mapper(self, codegen_state):
         return _DummyExpressionToCodeMapper()
 
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+    def get_kernel_call(self, codegen_state, name, gsize, lsize):
         return None
 
     @property
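
Note: making ASTBuilderBase generic over ASTType lets each concrete builder pin down
the AST node type it emits (e.g. ASTBuilderBase[Generable] for the C family below),
so the annotated get_function_definition and friends can be type-checked against a
concrete node type. A simplified standalone sketch of the pattern, not loopy's
actual classes:

    from typing import Generic, Sequence, TypeVar

    ASTType = TypeVar("ASTType")


    class ToyASTBuilder(Generic[ASTType]):
        def get_function_definition(self, decl: ASTType, body: ASTType) -> ASTType:
            raise NotImplementedError

        def generate_top_of_body(self) -> Sequence[ASTType]:
            return []


    class StringASTBuilder(ToyASTBuilder[str]):
        # here the "AST" is a plain string, and mypy checks the str signature
        def get_function_definition(self, decl: str, body: str) -> str:
            return decl + "\n{\n" + body + "\n}"


    print(StringASTBuilder().get_function_definition("void f(void)", "  return;"))
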
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 022308852035e15430d7e715d2e1d5134de5fb5f..9df5da5ca56b1fe3a6d0d5079922fad589a27631 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -23,21 +23,32 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import cast, Tuple, Optional
+import re
+
 import numpy as np  # noqa
-from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
-from loopy.diagnostic import LoopyError, LoopyTypeError
-from cgen import Pointer, NestedDeclarator, Block
+
+from cgen import Pointer, NestedDeclarator, Block, Generable, Declarator, Const
 from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
-from loopy.symbolic import IdentityMapper
-from loopy.types import NumpyType
-from loopy.kernel.function_interface import ScalarCallable
 import pymbolic.primitives as p
+from pytools import memoize_method
 
+from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
+from loopy.diagnostic import LoopyError, LoopyTypeError
+from loopy.symbolic import IdentityMapper
+from loopy.types import NumpyType, LoopyType
+from loopy.typing import ExpressionT
+from loopy.kernel import LoopKernel
+from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
+from loopy.kernel.data import (TemporaryVariable, AddressSpace, ArrayArg,
+        ConstantArg, ImageArg, ValueArg)
+from loopy.kernel.function_interface import ScalarCallable
+from loopy.schedule import CallKernel
 from loopy.tools import remove_common_indentation
-import re
+from loopy.codegen import CodeGenerationState
+from loopy.codegen.result import CodeGenerationResult
 
-from pytools import memoize_method
 
 __doc__ = """
 .. currentmodule loopy.target.c
@@ -250,9 +261,6 @@ def _preamble_generator(preamble_info, func_qualifier="inline"):
 
 # {{{ cgen overrides
 
-from cgen import Declarator
-
-
 class POD(Declarator):
     """A simple declarator: The type is given as a :class:`numpy.dtype`
     and the *name* is given as a string.
@@ -748,7 +756,7 @@ def get_gnu_libc_callables():
 # }}}
 
 
-class CFamilyASTBuilder(ASTBuilderBase):
+class CFamilyASTBuilder(ASTBuilderBase[Generable]):
 
     preamble_function_qualifier = "inline"
 
@@ -777,10 +785,13 @@ class CFamilyASTBuilder(ASTBuilderBase):
 
     # {{{ code generation
 
-    def get_function_definition(self, codegen_state, codegen_result,
-            schedule_index,
-            function_decl, function_body):
+    def get_function_definition(
+            self, codegen_state: CodeGenerationState,
+            codegen_result: CodeGenerationResult,
+            schedule_index: int, function_decl: Generable, function_body: Generable
+            ) -> Generable:
         kernel = codegen_state.kernel
+        assert kernel.linearization is not None
 
         from cgen import (
                 FunctionBody,
@@ -811,12 +822,8 @@ class CFamilyASTBuilder(ASTBuilderBase):
                         tv.initializer is not None):
                     assert tv.read_only
 
-                    decl_info, = tv.decl_info(self.target,
-                                    index_dtype=kernel.index_dtype)
                     decl = self.wrap_global_constant(
-                            self.get_temporary_decl(
-                                codegen_state, schedule_index, tv,
-                                decl_info))
+                            self.get_temporary_var_declarator(codegen_state, tv))
 
                     if tv.initializer is not None:
                         decl = Initializer(decl, generate_array_literal(
@@ -830,29 +837,16 @@ class CFamilyASTBuilder(ASTBuilderBase):
         else:
             return Collection(result+[Line(), fbody])
 
-    def idi_to_cgen_declarator(self, kernel, idi):
-        from loopy.kernel.data import InameArg
-        if (idi.offset_for_name is not None
-                or idi.stride_for_name_and_axis is not None):
-            assert not idi.is_written
-            from cgen import Const
-            return Const(POD(self, idi.dtype, idi.name))
-        elif issubclass(idi.arg_class, InameArg):
-            return InameArg(idi.name, idi.dtype).get_arg_decl(self)
-        else:
-            name = idi.base_name or idi.name
-            var_descr = kernel.get_var_descriptor(name)
-            from loopy.kernel.data import ArrayBase
-            if isinstance(var_descr, ArrayBase):
-                return var_descr.get_arg_decl(
-                        self,
-                        idi.name[len(name):], idi.shape, idi.dtype,
-                        idi.is_written)
-            else:
-                return var_descr.get_arg_decl(self)
+    def get_function_declaration(self, codegen_state: CodeGenerationState,
+            codegen_result: CodeGenerationResult, schedule_index: int) -> Generable:
+        kernel = codegen_state.kernel
+
+        assert codegen_state.kernel.linearization is not None
+        subkernel_name = cast(
+                        CallKernel,
+                        codegen_state.kernel.linearization[schedule_index]
+                        ).kernel_name
 
-    def get_function_declaration(self, codegen_state, codegen_result,
-            schedule_index):
         from cgen import FunctionDeclaration, Value
 
         name = codegen_result.current_program(codegen_state).name
@@ -861,146 +855,110 @@ class CFamilyASTBuilder(ASTBuilderBase):
 
         if codegen_state.is_entrypoint:
             name = Value("void", name)
+
+            # subkernel launches occur only as part of entrypoint kernels for now
+            from loopy.schedule.tools import get_subkernel_arg_info
+            skai = get_subkernel_arg_info(kernel, subkernel_name)
+            passed_names = skai.passed_names
+            written_names = skai.written_names
         else:
             name = Value("static void", name)
+            passed_names = [arg.name for arg in kernel.args]
+            written_names = kernel.get_written_variables()
+
         return FunctionDeclarationWrapper(
                 FunctionDeclaration(
                     name,
-                    [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
-                        for idi in codegen_state.implemented_data_info]))
-
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+                    [self.arg_to_cgen_declarator(
+                            kernel, arg_name,
+                            is_written=arg_name in written_names)
+                        for arg_name in passed_names]))
+
+    def get_kernel_call(self, codegen_state: CodeGenerationState,
+            subkernel_name: str,
+            gsize: Tuple[ExpressionT, ...],
+            lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]:
         return None
 
     def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import AddressSpace
 
         kernel = codegen_state.kernel
+        assert kernel.linearization is not None
 
-        base_storage_decls = []
         temp_decls = []
+        temp_decls_using_base_storage = []
 
         # {{{ declare temporaries
 
-        base_storage_sizes = {}
-        base_storage_to_scope = {}
-        base_storage_to_align_bytes = {}
-
-        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
+        from cgen import Initializer, Line
         # Getting the temporary variables that are needed for the current
         # sub-kernel.
         from loopy.schedule.tools import (
                 temporaries_read_in_subkernel,
-                temporaries_written_in_subkernel)
-        subkernel = kernel.linearization[schedule_index].kernel_name
+                temporaries_written_in_subkernel,
+                supporting_temporary_names)
+        subkernel_name = kernel.linearization[schedule_index].kernel_name
+        sub_knl_temps = (
+                temporaries_read_in_subkernel(kernel, subkernel_name)
+                | temporaries_written_in_subkernel(kernel, subkernel_name))
         sub_knl_temps = (
-                temporaries_read_in_subkernel(kernel, subkernel)
-                | temporaries_written_in_subkernel(kernel, subkernel))
+                sub_knl_temps
+                | supporting_temporary_names(kernel, sub_knl_temps))
+
+        ecm = self.get_expression_to_code_mapper(codegen_state)
 
         for tv in sorted(
                 kernel.temporary_variables.values(),
                 key=lambda key_tv: key_tv.name):
-            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)
-
             if not tv.base_storage:
-                for idi in decl_info:
-                    # global temp vars are mapped to arguments or global declarations
-                    if tv.address_space != AddressSpace.GLOBAL and (
-                            tv.name in sub_knl_temps):
-                        decl = self.wrap_temporary_decl(
-                                self.get_temporary_decl(
-                                    codegen_state, schedule_index, tv, idi),
-                                tv.address_space)
-
-                        if tv.initializer is not None:
-                            assert tv.read_only
-                            decl = Initializer(decl, generate_array_literal(
-                                codegen_state, tv, tv.initializer))
-
-                        temp_decls.append(decl)
+                # global temp vars are mapped to arguments or global
+                # declarations, no need to declare locally.
+                if tv.address_space != AddressSpace.GLOBAL and (
+                        tv.name in sub_knl_temps):
+                    decl = self.get_temporary_var_declarator(codegen_state, tv)
 
-            else:
-                assert tv.initializer is None
-                if (tv.address_space == AddressSpace.GLOBAL
-                        and codegen_state.is_generating_device_code):
-                    # global temps trigger no codegen in the device code
-                    continue
-
-                offset = 0
-                base_storage_sizes.setdefault(tv.base_storage, []).append(
-                        tv.nbytes)
-                base_storage_to_scope.setdefault(tv.base_storage, []).append(
-                        tv.address_space)
-
-                align_size = tv.dtype.itemsize
-
-                from loopy.kernel.array import VectorArrayDimTag
-                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
-                    if isinstance(dim_tag, VectorArrayDimTag):
-                        align_size *= axis_len
-
-                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
-                        align_size)
-
-                for idi in decl_info:
-                    cast_decl = POD(self, idi.dtype, "")
-                    temp_var_decl = POD(self, idi.dtype, idi.name)
-
-                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
-                    temp_var_decl = self.wrap_temporary_decl(
-                            temp_var_decl, tv.address_space)
-
-                    if tv._base_storage_access_may_be_aliasing:
-                        ptrtype = _ConstPointer
-                    else:
-                        # The 'restrict' part of this is a complete lie--of course
-                        # all these temporaries are aliased. But we're promising to
-                        # not use them to shovel data from one representation to the
-                        # other. That counts, right?
-                        ptrtype = _ConstRestrictPointer
-
-                    cast_decl = ptrtype(cast_decl)
-                    temp_var_decl = ptrtype(temp_var_decl)
-
-                    cast_tp, cast_d = cast_decl.get_decl_pair()
-                    temp_var_decl = Initializer(
-                            temp_var_decl,
-                            "({} {}) ({} + {})".format(
-                                " ".join(cast_tp), cast_d,
-                                tv.base_storage,
-                                offset))
-
-                    temp_decls.append(temp_var_decl)
-
-                    from pytools import product
-                    offset += (
-                            idi.dtype.itemsize
-                            * product(si for si in idi.shape))
-
-        ecm = self.get_expression_to_code_mapper(codegen_state)
+                    if tv.initializer is not None:
+                        assert tv.read_only
+                        decl = Initializer(decl, generate_array_literal(
+                            codegen_state, tv, tv.initializer))
 
-        for bs_name, bs_sizes in sorted(base_storage_sizes.items()):
-            bs_var_decl = Value("char", bs_name)
-            from pytools import single_valued
-            bs_var_decl = self.wrap_temporary_decl(
-                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))
+                    temp_decls.append(decl)
 
-            # FIXME: Could try to use isl knowledge to simplify max.
-            if all(isinstance(bs, int) for bs in bs_sizes):
-                bs_size_max = max(bs_sizes)
             else:
-                bs_size_max = p.Max(tuple(bs_sizes))
-
-            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))
+                assert tv.initializer is None
 
-            alignment = max(base_storage_to_align_bytes[bs_name])
-            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)
+                cast_decl = POD(self, tv.dtype, "")
+                temp_var_decl = POD(self, tv.dtype, tv.name)
 
-            base_storage_decls.append(bs_var_decl)
+                if tv._base_storage_access_may_be_aliasing:
+                    ptrtype = _ConstPointer
+                else:
+                    # The 'restrict' part of this is a complete lie--of course
+                    # all these temporaries are aliased. But we're promising to
+                    # not use them to shovel data from one representation to the
+                    # other. That counts, right?
+                    ptrtype = _ConstRestrictPointer
+
+                cast_decl = self.wrap_decl_for_address_space(
+                        ptrtype(cast_decl), tv.address_space)
+                temp_var_decl = self.wrap_decl_for_address_space(
+                        ptrtype(temp_var_decl), tv.address_space)
+
+                cast_tp, cast_d = cast_decl.get_decl_pair()
+                temp_var_decl = Initializer(
+                        temp_var_decl,
+                        "({} {}) ({} + {})".format(
+                            " ".join(cast_tp), cast_d,
+                            tv.base_storage,
+                            ecm(tv.offset)
+                            ))
+
+                temp_decls_using_base_storage.append(temp_var_decl)
 
         # }}}
 
-        result = base_storage_decls + temp_decls
+        result = temp_decls + temp_decls_using_base_storage
 
         if result:
             result.append(Line())
@@ -1018,8 +976,6 @@ class CFamilyASTBuilder(ASTBuilderBase):
 
     # }}}
 
-    # {{{ code generation guts
-
     @property
     def ast_module(self):
         import cgen
@@ -1037,36 +993,18 @@ class CFamilyASTBuilder(ASTBuilderBase):
         from loopy.target.c.codegen.expression import CExpressionToCodeMapper
         return CExpressionToCodeMapper()
 
-    def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
-        temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
-
-        if temp_var.read_only:
-            from cgen import Const
-            temp_var_decl = Const(temp_var_decl)
-
-        if decl_info.shape:
-            from cgen import ArrayOf
-            ecm = self.get_expression_to_code_mapper(codegen_state)
-            temp_var_decl = ArrayOf(temp_var_decl,
-                    ecm(p.flattened_product(decl_info.shape),
-                        prec=PREC_NONE, type_context="i"))
+    # {{{ declarators
 
-        if temp_var.alignment:
-            from cgen import AlignedAttribute
-            temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
-
-        return temp_var_decl
-
-    def wrap_temporary_decl(self, decl, scope):
+    def wrap_decl_for_address_space(
+            self, decl: Declarator, address_space: AddressSpace) -> Declarator:
         return decl
 
-    def wrap_global_constant(self, decl):
+    def wrap_global_constant(self, decl: Declarator) -> Declarator:
         from cgen import Static
         return Static(decl)
 
-    def get_value_arg_decl(self, name, shape, dtype, is_written):
-        assert shape == ()
-
+    def get_value_arg_declarator(
+            self, name: str, dtype: LoopyType, is_written: bool) -> Declarator:
         result = POD(self, dtype, name)
 
         if not is_written:
@@ -1079,34 +1017,114 @@ class CFamilyASTBuilder(ASTBuilderBase):
 
         return result
 
-    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
-        from cgen import RestrictPointer, Const
+    def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
+        arg_decl = POD(self, ary.dtype, ary.name)
+
+        if ary.dim_tags:
+            for dim_tag in ary.dim_tags:
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    # we're OK with that
+                    pass
+                else:
+                    raise NotImplementedError(
+                        f"{type(self).__name__} does not understand axis tag "
+                        f"'{type(dim_tag)}.")
+
+        return arg_decl
 
-        arg_decl = RestrictPointer(POD(self, dtype, name))
+    def get_array_arg_declarator(
+            self, arg: ArrayArg, is_written: bool) -> Declarator:
+        from cgen import RestrictPointer
+        arg_decl = RestrictPointer(
+                self.wrap_decl_for_address_space(
+                    self.get_array_base_declarator(arg), arg.address_space))
 
         if not is_written:
             arg_decl = Const(arg_decl)
 
         return arg_decl
 
-    def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from warnings import warn
-        warn("get_global_arg_decl is deprecated use get_array_arg_decl "
-                "instead.", DeprecationWarning, stacklevel=2)
-        from loopy.kernel.data import AddressSpace
-        return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape,
-                dtype, is_written)
+    def get_constant_arg_declarator(
+            self, arg: ConstantArg) -> Declarator:
+        from cgen import RestrictPointer
+        return Const(self.wrap_decl_for_address_space(
+            RestrictPointer(
+                self.get_array_base_declarator(arg)), arg.address_space))
+
+    def get_temporary_arg_decl(
+            self, temp_var: TemporaryVariable, is_written: bool) -> Declarator:
+        if temp_var.address_space == AddressSpace.GLOBAL:
+            from cgen import RestrictPointer
+            arg_decl = RestrictPointer(
+                    self.wrap_decl_for_address_space(
+                        self.get_array_base_declarator(temp_var),
+                        temp_var.address_space))
+            if not is_written:
+                arg_decl = Const(arg_decl)
+
+            return arg_decl
+        else:
+            raise LoopyError("unexpected request for argument declaration of "
+                    "non-global temporary")
+
+    def get_image_arg_declarator(
+            self, arg: ImageArg, is_written: bool) -> Declarator:
+        raise NotImplementedError()
+
+    def arg_to_cgen_declarator(
+            self, kernel: LoopKernel, passed_name: str, is_written: bool
+            ) -> Declarator:
+        if passed_name in kernel.all_inames():
+            assert not is_written
+            return self.get_value_arg_declarator(
+                    passed_name, kernel.index_dtype, is_written)
+        var_descr = kernel.get_var_descriptor(passed_name)
+        if isinstance(var_descr, ValueArg):
+            return self.get_value_arg_declarator(
+                    var_descr.name, var_descr.dtype, is_written)
+        elif isinstance(var_descr, ArrayArg):
+            return self.get_array_arg_declarator(var_descr, is_written)
+        elif isinstance(var_descr, TemporaryVariable):
+            return self.get_temporary_arg_decl(var_descr, is_written)
+        elif isinstance(var_descr, ConstantArg):
+            return self.get_constant_arg_declarator(var_descr)
+        elif isinstance(var_descr, ImageArg):
+            return self.get_image_arg_declarator(var_descr, is_written)
+        else:
+            raise ValueError(f"unexpected type of argument '{passed_name}': "
+                    f"'{type(var_descr)}'")
 
-    def get_constant_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.target.c import POD  # uses the correct complex type
-        from cgen import RestrictPointer, Const
+    def get_temporary_var_declarator(self,
+            codegen_state: CodeGenerationState,
+            temp_var: TemporaryVariable) -> Declarator:
+        temp_var_decl = self.get_array_base_declarator(temp_var)
 
-        arg_decl = RestrictPointer(POD(self, dtype, name))
+        if temp_var.storage_shape:
+            shape = temp_var.storage_shape
+        else:
+            shape = temp_var.shape
 
-        if not is_written:
-            arg_decl = Const(arg_decl)
+        assert isinstance(shape, tuple)
+        assert isinstance(temp_var.dim_tags, tuple)
 
-        return arg_decl
+        from loopy.kernel.array import drop_vec_dims
+        unvec_shape = drop_vec_dims(temp_var.dim_tags, shape)
+
+        if unvec_shape:
+            from cgen import ArrayOf
+            ecm = self.get_expression_to_code_mapper(codegen_state)
+            temp_var_decl = ArrayOf(temp_var_decl,
+                    ecm(p.flattened_product(unvec_shape),
+                        prec=PREC_NONE, type_context="i"))
+
+        if temp_var.alignment:
+            from cgen import AlignedAttribute
+            temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
+
+        return self.wrap_decl_for_address_space(temp_var_decl,
+                temp_var.address_space)
+
+    # }}}
 
     def emit_assignment(self, codegen_state, insn):
         kernel = codegen_state.kernel
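
Note: the new get_*_declarator methods build argument and temporary declarations by
nesting cgen declarators (POD wrapped in RestrictPointer, Const, ArrayOf, ...) rather
than going through implemented_data_info. A small sketch of that composition using
cgen directly; loopy's own POD subclass additionally resolves complex types via the
target, so the exact spelling of the output is cgen's, not a claim about loopy's:

    import numpy as np
    from cgen import ArrayOf, Const, POD, RestrictPointer

    # roughly the shape of what get_array_arg_declarator yields for a read-only
    # float array: a const restrict pointer to double
    arg_decl = Const(RestrictPointer(POD(np.float64, "a")))
    print(arg_decl)

    # roughly the shape of what get_temporary_var_declarator yields for a small
    # scratch array
    tmp_decl = ArrayOf(POD(np.int32, "tmp"), 16)
    print(tmp_decl)
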
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 22a6c8b43bf1695d38ad362b642a12cc1f0fec6b..bcaa4a395a2605fa248f3f6aa1ac35d3a49abf43 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -20,23 +20,46 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import Callable, Any, Union, Tuple, Sequence, Optional
 import tempfile
 import os
+import ctypes
+from dataclasses import dataclass
 
-from loopy.target.execution import (KernelExecutorBase, _KernelInfo,
-                             ExecutionWrapperGeneratorBase, get_highlighted_code)
+from immutables import Map
 from pytools import memoize_method
-from pytools.py_codegen import (Indentation)
+from pytools.codegen import Indentation, CodeGenerator
 from pytools.prefork import ExecError
 from codepy.toolchain import guess_toolchain, ToolchainGuessError, GCCToolchain
 from codepy.jit import compile_from_string
-import ctypes
 
 import numpy as np
 
+from loopy.typing import ExpressionT
+from loopy.types import LoopyType
+from loopy.kernel import LoopKernel
+from loopy.kernel.array import ArrayBase
+from loopy.kernel.data import ArrayArg
+from loopy.schedule.tools import KernelArgInfo
+from loopy.codegen.result import GeneratedProgram
+from loopy.translation_unit import TranslationUnit
+from loopy.target.execution import (KernelExecutorBase,
+                             ExecutionWrapperGeneratorBase, get_highlighted_code)
+
 import logging
 logger = logging.getLogger(__name__)
 
+DEF_EVEN_DIV_FUNCTION = """
+def _lpy_even_div(a, b):
+    result, remdr = divmod(a, b)
+    if remdr != 0:
+        # FIXME: This error message is kind of crummy.
+        raise ValueError("expected even division")
+    return result
+"""
+
+
+# {{{ CExecutionWrapperGenerator
 
 class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     """
@@ -65,20 +88,28 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ handle allocation of unspecified arguements
 
-    def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
+    def handle_alloc(
+            self, gen: CodeGenerator, arg: ArrayArg,
+            strify: Callable[[Union[ExpressionT, Tuple[ExpressionT, ...]]], str],
+            skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguements for C-execution
         """
         from pymbolic import var
 
-        num_axes = len(arg.unvec_shape)
+        assert isinstance(arg.shape, tuple)
+        num_axes = len(arg.shape)
         for i in range(num_axes):
-            gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
+            gen("_lpy_shape_%d = %s" % (i, strify(arg.shape[i])))
 
-        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+        from loopy.kernel.array import get_strides
+        strides = get_strides(arg)
+        num_axes = len(strides)
+
+        itemsize = arg.dtype.numpy_dtype.itemsize
         for i in range(num_axes):
             gen("_lpy_strides_%d = %s" % (i, strify(
-                itemsize*arg.unvec_strides[i])))
+                itemsize*strides[i])))
 
         if not skip_arg_checks:
             for i in range(num_axes):
@@ -95,7 +126,9 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                 for i in range(num_axes))
 
         # find order of array
-        order = "'C'" if (arg.shape == () or arg.unvec_strides[-1] == 1) else "'F'"
+        from loopy.kernel.array import get_strides
+        strides = get_strides(arg)
+        order = "'C'" if (arg.shape == () or strides[-1] == 1) else "'F'"
 
         gen("%(name)s = _lpy_np.empty(%(shape)s, "
                 "%(dtype)s, order=%(order)s)"
@@ -103,7 +136,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                     name=arg.name,
                     shape=strify(sym_shape),
                     dtype=self.python_dtype_str(
-                        gen, kernel_arg.dtype.numpy_dtype),
+                        gen, arg.dtype.numpy_dtype),
                     order=order))
 
         expected_strides = tuple(
@@ -136,6 +169,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         Add default C-imports to preamble
         """
         gen.add_to_preamble("import numpy as _lpy_np")
+        gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION)
 
     def initialize_system_args(self, gen):
         """
@@ -145,44 +179,30 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args,
-            kernel, implemented_data_info):
+    def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
+            kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None:
         gen("for knl in _lpy_c_kernels:")
         with Indentation(gen):
-            gen("knl({args})".format(
-                args=", ".join(args)))
+            gen(f"knl({', '.join(args)})")
 
     # }}}
 
     # {{{
 
-    def generate_output_handler(
-            self, gen, options, kernel, implemented_data_info):
-
-        from loopy.kernel.data import KernelArgument
-
-        def is_output(idi):
-            from loopy.kernel.array import ArrayBase
-            if not issubclass(idi.arg_class, ArrayBase):
-                return False
-
-            arg = kernel.impl_arg_to_arg[idi.name]
-            return arg.is_output
+    def generate_output_handler(self, gen: CodeGenerator,
+            kernel: LoopKernel, kai: KernelArgInfo) -> None:
+        options = kernel.options
 
         if options.return_dict:
             gen("return None, {%s}"
-                    % ", ".join(f'"{idi.name}": {idi.name}'
-                        for idi in implemented_data_info
-                        if issubclass(idi.arg_class, KernelArgument)
-                        if is_output(idi)))
+                    % ", ".join(f'"{arg_name}": {arg_name}'
+                        for arg_name in kai.passed_arg_names
+                        if kernel.arg_dict[arg_name].is_output))
         else:
-            out_idis = [idi
-                    for idi in implemented_data_info
-                        if issubclass(idi.arg_class, KernelArgument)
-                    if is_output(idi)]
-            if out_idis:
-                gen("return None, (%s,)"
-                        % ", ".join(idi.name for idi in out_idis))
+            out_names = [arg_name for arg_name in kai.passed_arg_names
+                    if kernel.arg_dict[arg_name].is_output]
+            if out_names:
+                gen(f"return None, ({', '.join(out_names)},)")
             else:
                 gen("return None, ()")
 
@@ -197,6 +217,10 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     def get_arg_pass(self, arg):
         return arg.name
 
+# }}}
+
+
+# {{{ CCompiler
 
 class CCompiler:
     """
@@ -304,6 +328,10 @@ class CCompiler:
         # and return compiled
         return ctypes.CDLL(ext_file)
 
+# }}}
+
+
+# {{{ CPlusPlusCompiler
 
 class CPlusPlusCompiler(CCompiler):
     """Subclass of CCompiler to invoke a C++ compiler."""
@@ -319,8 +347,10 @@ class CPlusPlusCompiler(CCompiler):
             libraries=libraries, include_dirs=include_dirs,
             library_dirs=library_dirs, defines=defines, source_suffix=source_suffix)
 
+# }}}
+
 
-# {{{ placeholder till ctypes fixes: bugs.python.org/issue16899
+# {{{ placeholder till ctypes fixes: https://github.com/python/cpython/issues/61103
 
 class Complex64(ctypes.Structure):
     _fields_ = [("real", ctypes.c_float), ("imag", ctypes.c_float)]
@@ -344,28 +374,10 @@ if hasattr(np, "complex256"):
 # }}}
 
 
-class IDIToCDLL:
-    """
-    A utility class that extracts arguement and return type info from a
-    :class:`ImplementedDataInfo` in order to create a :class:`ctype.CDLL`
-    """
-    def __init__(self, target):
-        self.target = target
-        from loopy.target.c import CTarget
-        self.registry = CTarget().get_dtype_registry().wrapped_registry
-
-    def __call__(self, knl, idi):
-        # next loop through the implemented data info to get the arg data
-        arg_info = []
-        for arg in idi:
-            # check if pointer: outputs and arrays must be passed
-            # by reference.
-            pointer = arg.shape or arg.is_written
-            arg_info.append(self._dtype_to_ctype(arg.dtype, pointer))
-
-        return arg_info
-
-    def _dtype_to_ctype(self, dtype, pointer=False):
+# {{{ _args_to_ctypes
+
+def _args_to_ctypes(kernel: LoopKernel, passed_names: Sequence[str]):
+    def _dtype_to_ctype(dtype):
         """Map NumPy dtype to equivalent ctypes type."""
         if dtype.is_complex():
             # complex ctypes aren't exposed
@@ -373,10 +385,23 @@ class IDIToCDLL:
             basetype = _NUMPY_COMPLEX_TYPE_TO_CTYPE[np_dtype]
         else:
             basetype = np.ctypeslib.as_ctypes_type(dtype)
-        if pointer:
-            return ctypes.POINTER(basetype)
         return basetype
 
+    arg_info = []
+    for arg_name in passed_names:
+        arg = kernel.arg_dict[arg_name]
+
+        ctype = _dtype_to_ctype(arg.dtype)
+        if isinstance(arg, ArrayBase):
+            ctype = ctypes.POINTER(ctype)
+        arg_info.append(ctype)
+
+    return arg_info
+
+# }}}
+
+
+# {{{ CompiledCKernel
 
 class CompiledCKernel:
     """
@@ -386,23 +411,19 @@ class CompiledCKernel:
     to automatically map argument types.
     """
 
-    def __init__(self, knl, idi, dev_code, target, comp=None):
-        from loopy.target.c import ExecutableCTarget
-        assert isinstance(target, ExecutableCTarget)
-        self.target = target
-        self.name = knl.name
+    def __init__(self, kernel: LoopKernel, devprog: GeneratedProgram,
+            passed_names: Sequence[str], dev_code: str,
+            comp: Optional["CCompiler"] = None):
         # get code and build
         self.code = dev_code
         self.comp = comp if comp is not None else CCompiler()
-        self.dll = self.comp.build(self.name, self.code)
+        self.dll = self.comp.build(devprog.name, self.code)
 
         # get the function declaration for interface with ctypes
-        func_decl = IDIToCDLL(self.target)
-        arg_info = func_decl(knl, idi)
-        self._fn = getattr(self.dll, self.name)
+        self._fn = getattr(self.dll, devprog.name)
         # kernels are void by defn.
         self._fn.restype = None
-        self._fn.argtypes = [ctype for ctype in arg_info]
+        self._fn.argtypes = _args_to_ctypes(kernel, passed_names)
 
     def __call__(self, *args):
         """Execute kernel with given args mapped to ctypes equivalents."""
@@ -419,6 +440,17 @@ class CompiledCKernel:
             args_.append(arg_)
         self._fn(*args_)
 
+# }}}
+
+
+@dataclass(frozen=True)
+class _KernelInfo:
+    t_unit: TranslationUnit
+    c_kernels: Sequence[CompiledCKernel]
+    invoker: Callable[..., Any]
+
+
+# {{{ CKernelExecutor
 
 class CKernelExecutor(KernelExecutorBase):
     """An object connecting a kernel to a :class:`CompiledKernel`
@@ -428,7 +460,7 @@ class CKernelExecutor(KernelExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, program, entrypoint, compiler=None):
+    def __init__(self, program, entrypoint, compiler: Optional["CCompiler"] = None):
         """
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
@@ -447,30 +479,34 @@ class CKernelExecutor(KernelExecutorBase):
         return CExecutionWrapperGenerator()
 
     @memoize_method
-    def program_info(self, entrypoint, arg_to_dtype_set=frozenset(),
-            all_kwargs=None):
-        program = self.get_typed_and_scheduled_translation_unit(
-                entrypoint, arg_to_dtype_set)
+    def translation_unit_info(
+            self, entrypoint: str,
+            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
+        # FIXME: Remove entrypoint argument
+        assert entrypoint == self.entrypoint
+
+        t_unit = self.get_typed_and_scheduled_translation_unit(
+                entrypoint, arg_to_dtype)
 
         from loopy.codegen import generate_code_v2
-        codegen_result = generate_code_v2(program)
+        codegen_result = generate_code_v2(t_unit)
 
         dev_code = codegen_result.device_code()
         host_code = codegen_result.host_code()
         all_code = "\n".join([dev_code, "", host_code])
 
-        if self.program[entrypoint].options.write_code:
+        if t_unit[entrypoint].options.write_code:
             output = all_code
-            if self.program[entrypoint].options.allow_terminal_colors:
+            if t_unit[entrypoint].options.allow_terminal_colors:
                 output = get_highlighted_code(output)
 
-            if self.program[entrypoint].options.write_code is True:
+            if t_unit[entrypoint].options.write_code is True:
                 print(output)
             else:
-                with open(self.program[entrypoint].options.write_code, "w") as outf:
+                with open(t_unit[entrypoint].options.write_code, "w") as outf:
                     outf.write(output)
 
-        if self.program[entrypoint].options.edit_code:
+        if t_unit[entrypoint].options.edit_code:
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.c")
             # update code from editor
@@ -478,19 +514,17 @@ class CKernelExecutor(KernelExecutorBase):
 
         c_kernels = []
 
+        from loopy.schedule.tools import get_kernel_arg_info
+        kai = get_kernel_arg_info(t_unit[entrypoint])
         for dp in codegen_result.device_programs:
-            c_kernels.append(CompiledCKernel(dp,
-                codegen_result.implemented_data_infos[entrypoint], all_code,
-                self.program.target, self.compiler))
+            c_kernels.append(CompiledCKernel(
+                t_unit[entrypoint], dp, kai.passed_names, all_code,
+                self.compiler))
 
         return _KernelInfo(
-                program=program,
+                t_unit=t_unit,
                 c_kernels=c_kernels,
-                implemented_data_info=codegen_result.implemented_data_infos[
-                    entrypoint],
-                invoker=self.get_invoker(program, entrypoint, codegen_result))
-
-    # }}}
+                invoker=self.get_invoker(t_unit, entrypoint, codegen_result))
 
     def __call__(self, *args, entrypoint=None, **kwargs):
         """
@@ -510,8 +544,12 @@ class CKernelExecutor(KernelExecutorBase):
         if self.packing_controller is not None:
             kwargs = self.packing_controller(kwargs)
 
-        program_info = self.program_info(entrypoint,
-                self.arg_to_dtype_set(kwargs))
+        program_info = self.translation_unit_info(entrypoint,
+                self.arg_to_dtype(kwargs))
 
         return program_info.invoker(
                 program_info.c_kernels, *args, **kwargs)
+
+# }}}
+
+# vim: foldmethod=marker
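
Note: _args_to_ctypes relies on np.ctypeslib.as_ctypes_type for scalars and wraps
array arguments in ctypes.POINTER, and the generated invoker uses _lpy_even_div to
turn byte strides back into element strides. A standalone sketch of both pieces
(function names here are illustrative, not loopy's):

    import ctypes

    import numpy as np


    def dtype_to_ctype(dtype: np.dtype, is_array: bool):
        # scalar dtype -> ctypes type; array arguments are passed by pointer
        base = np.ctypeslib.as_ctypes_type(dtype)
        return ctypes.POINTER(base) if is_array else base


    def even_div(a: int, b: int) -> int:
        result, remdr = divmod(a, b)
        if remdr != 0:
            raise ValueError("expected even division")
        return result


    assert dtype_to_ctype(np.dtype(np.float64), is_array=False) is ctypes.c_double

    a = np.zeros((4, 8))
    # leading-axis byte stride divided by the itemsize gives 8 elements
    assert even_div(a.strides[0], a.dtype.itemsize) == 8
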
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 05c6b3b929b8ffac13532f9467b7bf4c0b6e488a..496c75e58bcfcfb142479ec6d9b34dd71ecefa8e 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -163,7 +163,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         from loopy.kernel.array import _apply_offset
                         from loopy.symbolic import simplify_using_aff
 
-                        subscript = _apply_offset(0, expr.name, arg)
+                        subscript = _apply_offset(0, arg)
                         result = self.make_subscript(
                                 arg,
                                 var(expr.name),
@@ -223,7 +223,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         index_tuple = tuple(
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
-        access_info = get_access_info(self.kernel.target, ary, index_tuple,
+        access_info = get_access_info(self.kernel, ary, index_tuple,
                 lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                 self.codegen_state.vectorization_info)
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 5ba5d22ff9ccc286e7d7748d4702e8decb69dd5c..f95dea681904818e15d4a72a809aefad38352533 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -24,15 +24,16 @@ THE SOFTWARE.
 """
 
 import numpy as np
-
+from pymbolic import var
 from pytools import memoize_method
+from cgen import Declarator, Const
 
 from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.types import NumpyType
-from loopy.kernel.data import AddressSpace
-from pymbolic import var
+from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
+from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg, ArrayArg
 from loopy.kernel.function_interface import ScalarCallable
 
 
@@ -388,55 +389,77 @@ class CUDACASTBuilder(CFamilyASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def wrap_temporary_decl(self, decl, scope):
-        if scope == AddressSpace.LOCAL:
-            from cgen.cuda import CudaShared
+    # }}}
+
+    # {{{ declarators
+
+    def wrap_decl_for_address_space(
+            self, decl: Declarator, address_space: AddressSpace) -> Declarator:
+        from cgen.cuda import CudaGlobal, CudaShared
+        if address_space == AddressSpace.GLOBAL:
+            return CudaGlobal(decl)
+        if address_space == AddressSpace.LOCAL:
             return CudaShared(decl)
-        elif scope == AddressSpace.PRIVATE:
+        elif address_space == AddressSpace.PRIVATE:
             return decl
         else:
-            raise ValueError("unexpected temporary variable scope: %s"
-                    % scope)
+            raise ValueError("unexpected address_space: %s"
+                    % address_space)
 
-    def wrap_global_constant(self, decl):
-        from cgen.cuda import CudaConstant
+    def wrap_global_constant(self, decl: Declarator) -> Declarator:
+        from cgen.cuda import CudaConstant, CudaGlobal
+        assert isinstance(decl, CudaGlobal)
+        decl = decl.subdecl
         return CudaConstant(decl)
 
-    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
-        from loopy.target.c import POD  # uses the correct complex type
-        from cgen import Const
-        from cgen.cuda import CudaRestrictPointer
+    # duplicated in OpenCL, update there if updating here
+    def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
+        dtype = ary.dtype
+
+        vec_size = ary.vector_size(self.target)
+        if vec_size > 1:
+            dtype = self.target.vector_dtype(dtype, vec_size)
+
+        if ary.dim_tags:
+            for dim_tag in ary.dim_tags:
+                if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)):
+                    # we're OK with those
+                    pass
+
+                else:
+                    raise NotImplementedError(
+                        f"{type(self).__name__} does not understand axis tag "
+                        f"'{type(dim_tag)}.")
 
-        arg_decl = CudaRestrictPointer(POD(self, dtype, name))
+        from loopy.target.c import POD
+        return POD(self, dtype, ary.name)
+
+    def get_array_arg_declarator(
+            self, arg: ArrayArg, is_written: bool) -> Declarator:
+        from cgen.cuda import CudaRestrictPointer
+        arg_decl = CudaRestrictPointer(
+                self.wrap_decl_for_address_space(
+                    self.get_array_base_declarator(arg), arg.address_space))
 
         if not is_written:
             arg_decl = Const(arg_decl)
 
         return arg_decl
 
-    def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from warnings import warn
-        warn("get_global_arg_decl is deprecated use get_array_arg_decl "
-                "instead.", DeprecationWarning, stacklevel=2)
-        return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape,
-                dtype, is_written)
-
-    def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
-        raise NotImplementedError("not yet: texture arguments in CUDA")
-
-    def get_constant_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.target.c import POD  # uses the correct complex type
-        from cgen import RestrictPointer, Const
+    def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator:
+        from cgen import RestrictPointer
         from cgen.cuda import CudaConstant
 
-        arg_decl = RestrictPointer(POD(self, dtype, name))
+        # constant *is* an address space as far as CUDA is concerned, do not re-wrap
+        return CudaConstant(RestrictPointer(self.get_array_base_declarator(arg)))
 
-        if not is_written:
-            arg_decl = Const(arg_decl)
+    def get_image_arg_declarator(
+            self, arg: ImageArg, is_written: bool) -> Declarator:
+        raise NotImplementedError("not yet: texture arguments in CUDA")
 
-        return CudaConstant(arg_decl)
+    # }}}
 
-    # {{{ code generation for atomic update
+    # {{{ atomics
 
     def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
@@ -527,8 +550,6 @@ class CUDACASTBuilder(CFamilyASTBuilder):
 
     # }}}
 
-    # }}}
-
 # }}}
 
 # vim: foldmethod=marker
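
Note: in the CUDA builder, address-space qualifiers are now applied by wrapping
declarators with the cgen.cuda classes (CudaShared, CudaGlobal, CudaConstant,
CudaRestrictPointer) instead of through the removed wrap_temporary_decl and
get_*_arg_decl hooks. A small illustrative sketch of the composition using cgen
directly (the real builder also wraps the base declarator for the address space,
e.g. in CudaGlobal, before adding the pointer):

    import numpy as np
    from cgen import Const, POD, RestrictPointer
    from cgen.cuda import CudaConstant, CudaRestrictPointer, CudaShared

    # a local (shared-memory) temporary, as wrap_decl_for_address_space emits
    print(CudaShared(POD(np.float32, "scratch")))

    # e.g. a read-only array argument: restrict pointer, then const
    print(Const(CudaRestrictPointer(POD(np.float32, "a"))))

    # a __constant__ argument; "constant" is itself an address space in CUDA,
    # so get_constant_arg_declarator does not wrap it again
    print(CudaConstant(RestrictPointer(POD(np.float32, "coeffs"))))
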
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index e911ae89de27725aade963178639026e2b6460cf..131e35396da526e13604c57f846031352928bf5b 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -21,34 +21,35 @@ THE SOFTWARE.
 """
 
 
-import numpy as np
+from typing import (Callable, Tuple, Union, Set, FrozenSet, List, Dict,
+        Optional, Sequence, Any)
+from dataclasses import dataclass
+
+from immutables import Map
+
 from abc import ABC, abstractmethod
-from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
-from pytools.py_codegen import (
-        Indentation, PythonFunctionGenerator)
+from pytools.py_codegen import PythonFunctionGenerator
+from pytools.codegen import Indentation, CodeGenerator
+
+from pymbolic import var
 
 import logging
 logger = logging.getLogger(__name__)
 
 from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
+from loopy.typing import ExpressionT
+from loopy.types import LoopyType, NumpyType
+from loopy.kernel import KernelState, LoopKernel
+from loopy.kernel.data import _ArraySeparationInfo, ArrayArg, auto
+from loopy.translation_unit import TranslationUnit
+from loopy.schedule.tools import KernelArgInfo
 from loopy.version import DATA_MODEL_VERSION
 
 
 # {{{ object array argument packing
 
-class _PackingInfo(ImmutableRecord):
-    """
-    .. attribute:: name
-    .. attribute:: sep_shape
-
-    .. attribute:: subscripts_and_names
-
-        A list of type ``[(index, unpacked_name), ...]``.
-    """
-
-
 class SeparateArrayPackingController:
     """For argument arrays with axes tagged to be implemented as separate
     arrays, this class provides preprocessing of the incoming arguments so that
@@ -59,59 +60,60 @@ class SeparateArrayPackingController:
     It also repacks outgoing arrays of this type back into an object array.
     """
 
-    def __init__(self, packing_info):
-        self.packing_info = packing_info
+    def __init__(self, packing_info: Dict[str, _ArraySeparationInfo]) -> None:
+        # Length-1 index tuples are unpacked to bare ints so that the indices
+        # also work on plain (non-array) sequences, e.g. a tuple of arrays.
+        def untuple_length_1_indices(
+                ind: Tuple[int, ...]) -> Union[int, Tuple[int, ...]]:
+            if len(ind) == 1:
+                return ind[0]
+            else:
+                return ind
 
-    def __call__(self, kernel_kwargs):
-        if not self.packing_info:
-            return kernel_kwargs
+        self.packing_info = {
+                name: {
+                    untuple_length_1_indices(ind): sep_name
+                    for ind, sep_name in sep_info.subarray_names.items()
+                    }
+                for name, sep_info in packing_info.items()
+                }
 
+    def __call__(self, kernel_kwargs: Dict[str, Any]) -> Dict[str, Any]:
         kernel_kwargs = kernel_kwargs.copy()
 
-        for packing_info in self.packing_info.values():
-            arg_name = packing_info.name
-            if packing_info.name in kernel_kwargs:
-                arg = kernel_kwargs[arg_name]
-                for index, unpacked_name in packing_info.subscripts_and_names:
+        for name, ind_to_subary_name in self.packing_info.items():
+            if name in kernel_kwargs:
+                arg = kernel_kwargs[name]
+                for index, unpacked_name in ind_to_subary_name.items():
                     assert unpacked_name not in kernel_kwargs
                     kernel_kwargs[unpacked_name] = arg[index]
-                del kernel_kwargs[arg_name]
+                del kernel_kwargs[name]
 
         return kernel_kwargs
 
+# }}}
 
-def make_packing_controller(program, entrypoint):
-    packing_info = {}
-    from loopy.kernel.array import ArrayBase
-    for arg in program[entrypoint].args:
-        if not isinstance(arg, ArrayBase):
-            continue
 
-        if arg.shape is None or arg.dim_tags is None:
-            continue
+# {{{ ExecutionWrapperGeneratorBase
 
-        subscripts_and_names = arg.subscripts_and_names()
+def _str_to_expr(name_or_expr: Union[str, ExpressionT]) -> ExpressionT:
+    if isinstance(name_or_expr, str):
+        return var(name_or_expr)
+    else:
+        return name_or_expr
 
-        if subscripts_and_names is None:
-            continue
 
-        packing_info[arg.name] = _PackingInfo(
-                name=arg.name,
-                sep_shape=arg.sep_shape(),
-                subscripts_and_names=subscripts_and_names,
-                is_written=arg.name in
-                program[entrypoint].get_written_variables())
+@dataclass(frozen=True)
+class _ArgFindingEquation:
+    lhs: ExpressionT
+    rhs: ExpressionT
 
-    if packing_info:
-        return SeparateArrayPackingController(packing_info)
-    else:
-        return None
+    # Arg finding code is sorted by priority, lowest order first
+    order: int
 
-# }}}
+    based_on_names: FrozenSet[str]
+    require_names: bool
 
 
-# {{{ ExecutionWrapperGeneratorBase
-
 class ExecutionWrapperGeneratorBase(ABC):
     """
     A set of common methods for generating a wrapper
@@ -130,7 +132,7 @@ class ExecutionWrapperGeneratorBase(ABC):
     def python_dtype_str_inner(self, dtype):
         pass
 
-    def python_dtype_str(self, gen, numpy_dtype):
+    def python_dtype_str(self, gen: CodeGenerator, numpy_dtype):
         dtype_str = self.python_dtype_str_inner(numpy_dtype)
         try:
             return self.dtype_str_to_name[dtype_str]
@@ -147,176 +149,178 @@ class ExecutionWrapperGeneratorBase(ABC):
     # /!\ This code runs in a namespace controlled by the user.
     # Prefix all auxiliary variables with "_lpy".
 
-    # {{{ integer arg finding from shapes
-
-    def generate_integer_arg_finding_from_shapes(
-            self, gen, program, implemented_data_info):
-        # a mapping from integer argument names to a list of tuples
-        # (arg_name, expression), where expression is a
-        # unary function of kernel.arg_dict[arg_name]
-        # returning the desired integer argument.
-        iarg_to_sources = {}
+    # {{{ integer arg finding from array data
 
+    def generate_integer_arg_finding_from_array_data(
+            self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo
+            ) -> None:
         from loopy.kernel.data import ArrayArg
+        from loopy.kernel.array import get_strides
         from loopy.symbolic import DependencyMapper, StringifyMapper
         from loopy.diagnostic import ParameterFinderWarning
         dep_map = DependencyMapper()
 
-        from pymbolic import var
-        for arg in implemented_data_info:
-            if arg.arg_class is ArrayArg:
-                sym_shape = var(arg.name).attr("shape")
-                for axis_nr, shape_i in enumerate(arg.shape):
-                    if shape_i is None:
-                        continue
-
-                    deps = dep_map(shape_i)
-
-                    if len(deps) == 1:
-                        integer_arg_var, = deps
-
-                        if program.arg_dict[
-                                integer_arg_var.name].dtype.is_integral():
-                            from pymbolic.algorithm import solve_affine_equations_for
-                            try:
-                                # friggin' overkill :)
-                                iarg_expr = solve_affine_equations_for(
-                                        [integer_arg_var.name],
-                                        [(shape_i, sym_shape.index(axis_nr))]
-                                        )[integer_arg_var]
-                            except Exception as e:
-                                #from traceback import print_exc
-                                #print_exc()
-
-                                # went wrong? oh well
-                                from warnings import warn
-                                warn("Unable to generate code to automatically "
-                                        "find '%s' from the shape of '%s':\n%s"
-                                        % (integer_arg_var.name, arg.name, str(e)),
-                                        ParameterFinderWarning)
-                            else:
-                                iarg_to_sources.setdefault(integer_arg_var.name, [])\
-                                        .append((arg.name, iarg_expr))
-
-        gen("# {{{ find integer arguments from shapes")
-        gen("")
+        # {{{ find equations
+
+        equations: List[_ArgFindingEquation] = []
+
+        from pymbolic.primitives import If
+
+        for arg_name in kai.passed_arg_names:
+            arg = kernel.arg_dict[arg_name]
+            if isinstance(arg, ArrayArg):
+                assert arg.shape is not auto
+                if isinstance(arg.shape, tuple):
+                    for axis_nr, shape_i in enumerate(arg.shape):
+                        if shape_i is not None:
+                            equations.append(
+                                _ArgFindingEquation(
+                                    lhs=var(arg.name).attr("shape").index(axis_nr),
+                                    rhs=shape_i,
+                                    order=0,
+                                    based_on_names=frozenset({arg.name}),
+                                    require_names=True))
+
+                for axis_nr, stride_i in enumerate(get_strides(arg)):
+                    if stride_i is not None:
+                        equations.append(
+                                _ArgFindingEquation(
+                                    lhs=var("_lpy_even_div")(
+                                        var(arg.name).attr("strides").index(axis_nr),
+                                        arg.dtype.itemsize),
+                                    rhs=_str_to_expr(stride_i),
+                                    order=0,
+                                    based_on_names=frozenset({arg.name}),
+                                    require_names=True))
+
+                if arg.offset is not None:
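+                    # Note: the quoted attribute name and the "<name> is None"
+                    # variable below are a trick to splice literal Python
+                    # source into the generated wrapper code.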
+                    if not kernel.options.no_numpy:
+                        offset = var("getattr")(var(arg.name), var('"offset"'), 0)
+                    else:
+                        offset = var(arg.name).attr("offset")
 
-        for iarg_name, sources in iarg_to_sources.items():
-            gen("if %s is None:" % iarg_name)
-            with Indentation(gen):
-                if_stmt = "if"
-                for arg_name, value_expr in sources:
-                    gen(f"{if_stmt} {arg_name} is not None:")
-                    with Indentation(gen):
-                        gen("%s = %s"
-                                % (iarg_name, StringifyMapper()(value_expr)))
+                    offset = If(var(f"{arg.name} is None"), 0, offset)
 
-                    if_stmt = "elif"
+                    equations.append(
+                            _ArgFindingEquation(
+                                lhs=var("_lpy_even_div")(
+                                    offset, arg.dtype.itemsize),
+                                rhs=_str_to_expr(arg.offset),
 
-            gen("")
+                                # Argument finding from offsets should run last,
+                                # as it assumes a zero offset if a variable is
+                                # not passed. That should only be done if no
+                                # other approach yielded a value for the variable.
+                                order=1,
+                                based_on_names=frozenset({arg.name}),
+                                require_names=False,
+                                ))
 
-        gen("# }}}")
-        gen("")
-
-    # }}}
+        # }}}
 
-    # {{{ integer arg finding from offsets
+        # {{{ regroup equations by unknown
 
-    def generate_integer_arg_finding_from_offsets(self, gen, kernel,
-            implemented_data_info):
-        options = kernel.options
+        unknown_to_equations: Dict[str, List[_ArgFindingEquation]] = {}
 
-        gen("# {{{ find integer arguments from offsets")
-        gen("")
+        for eqn in equations:
+            deps = dep_map(eqn.rhs)
 
-        for arg in implemented_data_info:
-            impl_array_name = arg.offset_for_name
-            if impl_array_name is not None:
-                gen("if %s is None:" % arg.name)
-                with Indentation(gen):
-                    gen("if %s is None:" % impl_array_name)
-                    with Indentation(gen):
-                        gen("# Output variable, we'll be allocating "
-                                "it, with zero offset.")
-                        gen("%s = 0" % arg.name)
-                    gen("else:")
-                    with Indentation(gen):
-                        if not options.no_numpy:
-                            gen('_lpy_offset = getattr(%s, "offset", 0)'
-                                    % impl_array_name)
-                        else:
-                            gen("_lpy_offset = %s.offset" % impl_array_name)
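+            # Only equations whose right-hand side depends on exactly one
+            # variable can be used here; anything else would require solving
+            # simultaneous equations, which is not attempted.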
+            if len(deps) == 1:
+                unknown_var, = deps
+                unknown_to_equations.setdefault(unknown_var.name, []).append(eqn)
 
-                        base_arg = kernel.impl_arg_to_arg[impl_array_name]
+        del equations
 
-                        if not options.skip_arg_checks:
-                            gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
-                                    % (arg.name, base_arg.dtype.itemsize))
+        # }}}
 
-                            gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
-                                    'not divisible by its dtype itemsize"'
-                                    % impl_array_name)
-                            gen("del _lpy_remdr")
-                        else:
-                            gen("%s = _lpy_offset // %d"
-                                    % (arg.name, base_arg.dtype.itemsize))
+        # {{{ generate arg finding code
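+
+        # As a (hypothetical) example: for an array argument "a" declared with
+        # shape (n,), the equation "a.shape[0] == n" recorded above leads to
+        # generated wrapper code roughly of the form
+        #
+        #     if n is None:
+        #         if a is not None:
+        #             n = a.shape[0]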
 
-                        if not options.skip_arg_checks:
-                            gen("del _lpy_offset")
+        from pymbolic.algorithm import solve_affine_equations_for
+        from pymbolic.primitives import Variable
+        from pytools.codegen import CodeGenerator
 
-        gen("# }}}")
+        gen("# {{{ find integer arguments from array data")
         gen("")
 
-    # }}}
+        for unknown_name in sorted(unknown_to_equations):
+            unk_equations = sorted(unknown_to_equations[unknown_name],
+                    key=lambda eqn: eqn.order)
+            req_subgen = CodeGenerator()
+            not_req_subgen = CodeGenerator()
 
-    # {{{ integer arg finding from strides
+            seen_based_on_names: Set[FrozenSet[str]] = set()
 
-    def generate_integer_arg_finding_from_strides(
-            self, gen, kernel, implemented_data_info):
-        options = kernel.options
+            if_or_elif = "if"
 
-        gen("# {{{ find integer arguments from strides")
-        gen("")
+            for eqn in unk_equations:
+                try:
+                    # overkill :)
+                    value_expr = solve_affine_equations_for(
+                            [unknown_name],
+                            [(eqn.lhs, eqn.rhs)]
+                            )[Variable(unknown_name)]
+                except Exception as e:
+                    # went wrong? oh well
+                    from warnings import warn
+                    warn("Unable to generate code to automatically "
+                            f"find '{unknown_name}' "
+                            f"from '{', '.join(eqn.based_on_names)}':\n"
+                            f"{e}", ParameterFinderWarning)
+                    continue
+
+                # Do not use more than one bit of data from each of the
+                # 'based_on_names' to find each value, i.e. if a value can be
+                # found via shape and strides, only one of them suffices.
+                # This also helps because strides can be unreliable in the
+                # face of zero-length axes.
+                if eqn.based_on_names in seen_based_on_names:
+                    continue
+                seen_based_on_names.add(eqn.based_on_names)
+
+                if eqn.require_names:
+                    condition = " and ".join(
+                            f"{ary_name} is not None"
+                            for ary_name in eqn.based_on_names)
+                    req_subgen(f"{if_or_elif} {condition}:")
+                    with Indentation(req_subgen):
+                        req_subgen(
+                                f"{unknown_name} = {StringifyMapper()(value_expr)}")
+                    if_or_elif = "elif"
+
+                    req_subgen("")
+                else:
+                    not_req_subgen(
+                            f"{unknown_name} = {StringifyMapper()(value_expr)}")
 
-        for arg in implemented_data_info:
-            if arg.stride_for_name_and_axis is not None:
-                impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
+                    not_req_subgen("")
 
-                gen("if %s is None:" % arg.name)
+            if not_req_subgen.code:
+                gen(f"if {unknown_name} is None:")
                 with Indentation(gen):
-                    if not options.skip_arg_checks:
-                        gen("if %s is None:" % impl_array_name)
+                    gen.extend(not_req_subgen)
+
+                    if req_subgen.code:
+                        # Still None? Fall back to the equations that require
+                        # their array arguments to actually be present.
+                        gen(f"if {unknown_name} is None:")
                         with Indentation(gen):
-                            gen("raise RuntimeError(\"required stride '%s' for "
-                                    "argument '%s' not given or deducible from "
-                                    'passed array")'
-                                    % (arg.name, impl_array_name))
-
-                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                    if not options.skip_arg_checks:
-                        gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
-                                % (arg.name, impl_array_name, stride_impl_axis,
-                                    base_arg.dtype.dtype.itemsize))
-
-                        gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' "
-                                ' is not divisible by its dtype itemsize"'
-                                % (stride_impl_axis, impl_array_name))
-                        gen("del _lpy_remdr")
-                    else:
-                        gen("%s = %s.strides[%d] // %d"
-                                % (arg.name,  impl_array_name, stride_impl_axis,
-                                    base_arg.dtype.itemsize))
+                            gen.extend(req_subgen)
+            elif req_subgen.code:
+                gen(f"if {unknown_name} is None:")
+                with Indentation(gen):
+                    gen.extend(req_subgen)
 
         gen("# }}}")
         gen("")
 
+        # }}}
+
     # }}}
 
     # {{{ check that value args are present
 
     def generate_value_arg_check(
-            self, gen, kernel, implemented_data_info):
+            self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo
+            ) -> None:
         if kernel.options.skip_arg_checks:
             return
 
@@ -325,8 +329,9 @@ class ExecutionWrapperGeneratorBase(ABC):
         gen("# {{{ check that value args are present")
         gen("")
 
-        for arg in implemented_data_info:
-            if not issubclass(arg.arg_class, ValueArg):
+        for arg_name in kai.passed_arg_names:
+            arg = kernel.arg_dict[arg_name]
+            if not isinstance(arg, ValueArg):
                 continue
 
             gen("if %s is None:" % arg.name)
@@ -342,14 +347,17 @@ class ExecutionWrapperGeneratorBase(ABC):
 
     # {{{ handle non numpy arguements
 
-    def handle_non_numpy_arg(self, gen, arg):
+    def handle_non_numpy_arg(self, gen: CodeGenerator, arg):
         raise NotImplementedError()
 
     # }}}
 
     # {{{ handle allocation of unspecified arguements
 
-    def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
+    def handle_alloc(
+            self, gen: CodeGenerator, arg: ArrayArg,
+            strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str],
+            skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguements for C-execution
         """
@@ -384,10 +392,12 @@ class ExecutionWrapperGeneratorBase(ABC):
     # {{{ arg setup
 
     def generate_arg_setup(
-            self, gen, kernel, implemented_data_info, options):
+            self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo,
+            ) -> Sequence[str]:
+        options = kernel.options
         import loopy as lp
 
-        from loopy.kernel.data import KernelArgument
+        from loopy.kernel.data import ImageArg
         from loopy.kernel.array import ArrayBase
         from loopy.symbolic import StringifyMapper
         from loopy.types import NumpyType
@@ -404,21 +414,11 @@ class ExecutionWrapperGeneratorBase(ABC):
 
         strify = StringifyMapper()
 
-        expect_no_more_arguments = False
+        for arg_name in kai.passed_arg_names:
+            arg = kernel.arg_dict[arg_name]
+            is_written = arg.name in kernel.get_written_variables()
 
-        for arg in implemented_data_info:
-            is_written = arg.base_name in kernel.get_written_variables()
-            kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
-
-            if not issubclass(arg.arg_class, KernelArgument):
-                expect_no_more_arguments = True
-                continue
-
-            if expect_no_more_arguments:
-                raise LoopyError("Further arguments encountered after arg info "
-                        "describing a global temporary variable")
-
-            if not issubclass(arg.arg_class, ArrayBase):
+            if not isinstance(arg, ArrayBase):
                 args.append(arg.name)
                 continue
 
@@ -428,15 +428,14 @@ class ExecutionWrapperGeneratorBase(ABC):
             if not options.no_numpy:
                 self.handle_non_numpy_arg(gen, arg)
 
-            if not options.skip_arg_checks and kernel_arg.is_input:
+            if not options.skip_arg_checks and arg.is_input:
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     gen("raise RuntimeError(\"input argument '%s' must "
                             'be supplied")' % arg.name)
                     gen("")
 
-            if (is_written
-                    and arg.arg_class is lp.ImageArg
+            if (is_written and isinstance(arg, ImageArg)
                     and not options.skip_arg_checks):
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
@@ -455,8 +454,8 @@ class ExecutionWrapperGeneratorBase(ABC):
 
             # {{{ allocate written arrays, if needed
 
-            if kernel_arg.is_output \
-                    and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
+            if arg.is_output \
+                    and isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \
                     and arg.shape is not None \
                     and all(si is not None for si in arg.shape):
 
@@ -471,7 +470,7 @@ class ExecutionWrapperGeneratorBase(ABC):
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     self.handle_alloc(
-                        gen, arg, kernel_arg, strify, options.skip_arg_checks)
+                        gen, arg, strify, options.skip_arg_checks)
                     gen("_lpy_made_by_loopy = True")
                     gen("")
 
@@ -479,7 +478,7 @@ class ExecutionWrapperGeneratorBase(ABC):
 
             # {{{ argument checking
 
-            if arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
+            if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \
                     and not options.skip_arg_checks:
                 if possibly_made_by_loopy:
                     gen("if not _lpy_made_by_loopy:")
@@ -489,7 +488,7 @@ class ExecutionWrapperGeneratorBase(ABC):
                 with Indentation(gen):
                     gen("if %s.dtype != %s:"
                             % (arg.name, self.python_dtype_str(
-                                gen, kernel_arg.dtype.numpy_dtype)))
+                                gen, arg.dtype.numpy_dtype)))
                     with Indentation(gen):
                         gen("raise TypeError(\"dtype mismatch on argument '%s' "
                                 '(got: %%s, expected: %s)" %% %s.dtype)'
@@ -503,7 +502,9 @@ class ExecutionWrapperGeneratorBase(ABC):
                         else:
                             return strify(shape_axis)
 
-                    def strify_tuple(t):
+                    def strify_tuple(t: Optional[Tuple[ExpressionT, ...]]) -> str:
+                        if t is None:
+                            return "None"
                         if len(t) == 0:
                             return "()"
                         else:
@@ -512,21 +513,22 @@ class ExecutionWrapperGeneratorBase(ABC):
                                     for sa in t)
 
                     shape_mismatch_msg = (
-                            "raise TypeError(\"shape mismatch on argument '%s' "
+                            "raise ValueError(\"shape mismatch on argument '%s' "
                             '(got: %%s, expected: %%s)" '
                             "%% (%s.shape, %s))"
-                            % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
+                            % (arg.name, arg.name, strify_tuple(arg.shape)))
 
-                    if kernel_arg.shape is None:
+                    if arg.shape is None:
                         pass
 
-                    elif any(shape_axis is None for shape_axis in kernel_arg.shape):
+                    elif any(shape_axis is None for shape_axis in arg.shape):
+                        assert isinstance(arg.shape, tuple)
                         gen("if len(%s.shape) != %s:"
-                                % (arg.name, len(arg.unvec_shape)))
+                                % (arg.name, len(arg.shape)))
                         with Indentation(gen):
                             gen(shape_mismatch_msg)
 
-                        for i, shape_axis in enumerate(arg.unvec_shape):
+                        for i, shape_axis in enumerate(arg.shape):
                             if shape_axis is None:
                                 continue
 
@@ -537,18 +539,21 @@ class ExecutionWrapperGeneratorBase(ABC):
 
                     else:  # not None, no Nones in tuple
                         gen("if %s.shape != %s:"
-                                % (arg.name, strify(arg.unvec_shape)))
+                                % (arg.name, strify(arg.shape)))
                         with Indentation(gen):
                             gen(shape_mismatch_msg)
 
                     # }}}
 
-                    if arg.unvec_strides and kernel_arg.dim_tags:
-                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                        sym_strides = tuple(
-                                itemsize*s_i for s_i in arg.unvec_strides)
+                    from loopy.kernel.array import get_strides
+                    strides = get_strides(arg)
 
-                        ndim = len(arg.unvec_shape)
+                    if strides and arg.dim_tags and arg.shape is not None:
+                        assert isinstance(arg.shape, tuple)
+                        itemsize = arg.dtype.numpy_dtype.itemsize
+                        sym_strides = tuple(itemsize*s_i for s_i in strides)
+
+                        ndim = len(arg.shape)
                         shape = ["_lpy_shape_%d" % i for i in range(ndim)]
                         strides = ["_lpy_stride_%d" % i for i in range(ndim)]
 
@@ -561,14 +566,8 @@ class ExecutionWrapperGeneratorBase(ABC):
                                     shape, strides,
                                     [strify(s) for s in sym_strides]))
                         with Indentation(gen):
-                            gen("_lpy_got = tuple(stride "
-                                    "for (dim, stride) in zip(%s.shape, %s.strides) "
-                                    ")"
-                                    % (arg.name, arg.name))
-                            gen("_lpy_expected = tuple(stride "
-                                    "for (dim, stride) in zip(%s.shape, %s) "
-                                    ")"
-                                    % (arg.name, strify_tuple(sym_strides)))
+                            gen(f"_lpy_got = {arg.name}.strides")
+                            gen(f"_lpy_expected = {strify_tuple(sym_strides)}")
 
                             gen('raise ValueError("strides mismatch on '
                                     "argument '%s' "
@@ -576,7 +575,7 @@ class ExecutionWrapperGeneratorBase(ABC):
                                     "%% (_lpy_got, _lpy_expected))"
                                     % arg.name)
 
-                    if not arg.allows_offset:
+                    if not arg.offset:
                         gen("if hasattr({}, 'offset') and {}.offset:".format(
                                 arg.name, arg.name))
                         with Indentation(gen):
@@ -592,7 +591,7 @@ class ExecutionWrapperGeneratorBase(ABC):
                 gen("del _lpy_made_by_loopy")
                 gen("")
 
-            if arg.arg_class in [lp.ArrayArg, lp.ConstantArg]:
+            if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)):
                 args.append(self.get_arg_pass(arg))
             else:
                 args.append("%s" % arg.name)
@@ -623,17 +622,16 @@ class ExecutionWrapperGeneratorBase(ABC):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args,
-            kernel, implemented_data_info):
+    def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
+            kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None:
         raise NotImplementedError()
 
     # }}}
 
     # {{{ output
 
-    def generate_output_handler(
-            self, gen, options, kernel, implemented_data_info):
-
+    def generate_output_handler(self, gen: CodeGenerator,
+            kernel: LoopKernel, kai: KernelArgInfo) -> None:
         raise NotImplementedError()
 
     # }}}
@@ -653,16 +651,17 @@ class ExecutionWrapperGeneratorBase(ABC):
             kernel
         """
 
-        options = program[entrypoint].options
-        implemented_data_info = codegen_result.implemented_data_infos[entrypoint]
+        kernel = program[entrypoint]
+        options = kernel.options
+
+        from loopy.schedule.tools import get_kernel_arg_info
+        kai = get_kernel_arg_info(kernel)
 
-        from loopy.kernel.data import KernelArgument
         gen = PythonFunctionGenerator(
                 "invoke_%s_loopy_kernel" % entrypoint,
                 self.system_args + [
-                    "%s=None" % idi.name
-                    for idi in implemented_data_info
-                    if issubclass(idi.arg_class, KernelArgument)
+                    "%s=None" % arg_name
+                    for arg_name in kai.passed_arg_names
                     ])
 
         self.target_specific_preamble(gen)
@@ -672,25 +671,18 @@ class ExecutionWrapperGeneratorBase(ABC):
 
         self.initialize_system_args(gen)
 
-        self.generate_integer_arg_finding_from_shapes(
-            gen, program[entrypoint], implemented_data_info)
-        self.generate_integer_arg_finding_from_offsets(
-            gen, program[entrypoint], implemented_data_info)
-        self.generate_integer_arg_finding_from_strides(
-            gen, program[entrypoint], implemented_data_info)
-        self.generate_value_arg_check(
-            gen, program[entrypoint], implemented_data_info)
-        args = self.generate_arg_setup(
-            gen, program[entrypoint], implemented_data_info, options)
+        self.generate_integer_arg_finding_from_array_data(
+                gen, program[entrypoint], kai)
+        self.generate_value_arg_check(gen, program[entrypoint], kai)
+        args = self.generate_arg_setup(gen, program[entrypoint], kai)
 
         #FIXME: should we make this as a dict as well.
         host_program_name = codegen_result.host_programs[entrypoint].name
 
-        self.generate_invocation(gen, host_program_name, args,
-                program[entrypoint], implemented_data_info)
+        self.generate_invocation(gen, program[entrypoint], kai,
+                host_program_name, args)
 
-        self.generate_output_handler(gen, options, program[entrypoint],
-                implemented_data_info)
+        self.generate_output_handler(gen, program[entrypoint], kai)
 
         if options.write_wrapper:
             output = gen.get()
@@ -710,14 +702,6 @@ class ExecutionWrapperGeneratorBase(ABC):
 # }}}
 
 
-class _KernelInfo(ImmutableRecord):
-    pass
-
-
-class _Kernels:
-    pass
-
-
 typed_and_scheduled_cache = WriteOncePersistentDict(
         "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
@@ -737,18 +721,13 @@ class KernelExecutorBase:
     .. automethod:: __init__
     .. automethod:: __call__
     """
+    packing_controller: Optional[SeparateArrayPackingController]
 
-    def __init__(self, program, entrypoint):
-        """
-        :arg kernel: a loopy.LoopKernel
-        """
-
-        self.program = program
+    def __init__(self, t_unit: TranslationUnit, entrypoint: str):
+        self.t_unit = t_unit
         self.entrypoint = entrypoint
 
-        self.packing_controller = make_packing_controller(program, entrypoint)
-
-        kernel = self.program[entrypoint]
+        kernel = self.t_unit[entrypoint]
         self.output_names = set(arg.name for arg in kernel.args if arg.is_output)
 
         from loopy import ArrayArg
@@ -759,6 +738,23 @@ class KernelExecutorBase:
         self.has_runtime_typed_args = any(
             arg.dtype is None for arg in kernel.args)
 
+        # We're doing this ahead of time to learn about array separation.
+        # This will be done again as part of preprocessing below, and we're
+        # betting that it happens consistently both times. (No reason it wouldn't,
+        # but it is done redundantly.) We can't *use* the result of this
+        # because we need to do the 'official' array separation after type
+        # inference has completed.
+        from loopy.preprocess import make_arrays_for_sep_arrays
+        self.separated_entry_knl = make_arrays_for_sep_arrays(
+                self.t_unit[self.entrypoint])
+
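+        # sep_info records, for each sep-tagged array argument, the names of
+        # the per-index subarrays it is split into; the packing controller
+        # uses this to split user-supplied arrays accordingly at call time.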
+        self.sep_info = self.separated_entry_knl._separation_info()
+        if self.sep_info:
+            self.packing_controller = SeparateArrayPackingController(self.sep_info)
+        else:
+            self.packing_controller = None
+
     def check_for_required_array_arguments(self, input_args):
         # Formerly, the first exception raised when a required argument is not
         # passed was often at type inference. This exists to raise a more meaningful
@@ -768,7 +764,7 @@ class KernelExecutorBase:
         # and links therin for context.
         if not self.input_array_names <= set(input_args):
             missing_args = self.input_array_names - set(input_args)
-            kernel = self.program[self.entrypoint]
+            kernel = self.t_unit[self.entrypoint]
             raise LoopyError(
                 f"Kernel {kernel.name}() missing required array input arguments: "
                 f"{', '.join(missing_args)}. "
@@ -776,44 +772,48 @@ class KernelExecutorBase:
                 "your argument.")
 
     def get_typed_and_scheduled_translation_unit_uncached(
-            self, entrypoint, arg_to_dtype_set):
-        from loopy.kernel.tools import add_dtypes
-        from loopy.kernel import KernelState
-        from loopy.translation_unit import resolve_callables
-
-        program = resolve_callables(self.program)
-
-        if arg_to_dtype_set:
-            var_to_dtype = {}
-            entry_knl = program[entrypoint]
-            for var, dtype in arg_to_dtype_set:
-                if var in entry_knl.impl_arg_to_arg:
-                    dest_name = entry_knl.impl_arg_to_arg[var].name
-                else:
-                    dest_name = var
+            self, entrypoint, arg_to_dtype: Optional[Map[str, LoopyType]]
+            ) -> TranslationUnit:
+        t_unit = self.t_unit
+
+        if arg_to_dtype:
+            entry_knl = t_unit[entrypoint]
 
-                var_to_dtype[dest_name] = dtype
+            # FIXME: This is not so nice. This transfers types from the
+            # subarrays of sep-tagged arrays to the 'main' array, because
+            # type inference fails otherwise.
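+            # (E.g., hypothetically, a dtype supplied for a subarray "a_s0"
+            # is recorded on its parent argument "a" and the subarray entry
+            # is dropped.)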
+            with arg_to_dtype.mutate() as mm:
+                for name, sep_info in self.sep_info.items():
+                    if entry_knl.arg_dict[name].dtype is None:
+                        for sep_name in sep_info.subarray_names.values():
+                            if sep_name in arg_to_dtype:
+                                mm.set(name, arg_to_dtype[sep_name])
+                                del mm[sep_name]
 
-            program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype))
+                arg_to_dtype = mm.finish()
+
+            from loopy.kernel.tools import add_dtypes
+            t_unit = t_unit.with_kernel(add_dtypes(entry_knl, arg_to_dtype))
 
             from loopy.type_inference import infer_unknown_types
-            program = infer_unknown_types(program, expect_completion=True)
+            t_unit = infer_unknown_types(t_unit, expect_completion=True)
 
-        if program.state < KernelState.LINEARIZED:
+        if t_unit.state < KernelState.PREPROCESSED:
             from loopy.preprocess import preprocess_program
-            program = preprocess_program(program)
+            t_unit = preprocess_program(t_unit)
 
-            from loopy.schedule import get_one_linearized_kernel
-            for e in program.entrypoints:
-                program = program.with_kernel(
-                    get_one_linearized_kernel(program[e], program.callables_table))
+        if t_unit.state < KernelState.LINEARIZED:
+            from loopy.schedule import linearize
+            t_unit = linearize(t_unit)
 
-        return program
+        return t_unit
 
-    def get_typed_and_scheduled_translation_unit(self, entrypoint, arg_to_dtype_set):
+    def get_typed_and_scheduled_translation_unit(
+            self, entrypoint: str, arg_to_dtype: Optional[Map[str, LoopyType]]
+            ) -> TranslationUnit:
         from loopy import CACHING_ENABLED
 
-        cache_key = (type(self).__name__, self.program, arg_to_dtype_set)
+        cache_key = (type(self).__name__, self.t_unit, arg_to_dtype)
 
         if CACHING_ENABLED:
             try:
@@ -822,29 +822,24 @@ class KernelExecutorBase:
                 pass
 
         logger.debug("%s: typed-and-scheduled cache miss" %
-                self.program.entrypoints)
+                self.t_unit.entrypoints)
 
         kernel = self.get_typed_and_scheduled_translation_unit_uncached(entrypoint,
-                arg_to_dtype_set)
+                arg_to_dtype)
 
         if CACHING_ENABLED:
             typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
 
         return kernel
 
-    def arg_to_dtype_set(self, kwargs):
-        kwargs = kwargs.copy()
+    def arg_to_dtype(self, kwargs) -> Optional[Map[str, LoopyType]]:
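+        # Returns a mapping from argument names to dtypes deduced from the
+        # passed values, for arguments whose dtype is not known ahead of time;
+        # returns None if all argument dtypes are already known.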
         if not self.has_runtime_typed_args:
             return None
 
-        impl_arg_to_arg = self.program[self.entrypoint].impl_arg_to_arg
+        arg_dict = self.separated_entry_knl.arg_dict
         arg_to_dtype = {}
         for arg_name, val in kwargs.items():
-            arg = impl_arg_to_arg.get(arg_name, None)
-
-            if arg is None:
-                # offsets, strides and such
-                continue
+            arg = arg_dict[arg_name]
 
             if arg.dtype is None and val is not None:
                 try:
@@ -852,9 +847,9 @@ class KernelExecutorBase:
                 except AttributeError:
                     pass
                 else:
-                    arg_to_dtype[arg_name] = dtype
+                    arg_to_dtype[arg_name] = NumpyType(dtype)
 
-        return frozenset(arg_to_dtype.items())
+        return Map(arg_to_dtype)
 
     # {{{ debugging aids
 
@@ -863,20 +858,9 @@ class KernelExecutorBase:
             code = self.get_code(entrypoint, arg_to_dtype)
         return get_highlighted_code(code)
 
-    def get_code(self, entrypoint, arg_to_dtype=None):
-        def process_dtype(dtype):
-            if isinstance(dtype, type) and issubclass(dtype, np.generic):
-                dtype = np.dtype(dtype)
-            if isinstance(dtype, np.dtype):
-                from loopy.types import NumpyType
-                dtype = NumpyType(dtype, self.program.target)
-
-            return dtype
-
-        if arg_to_dtype is not None:
-            arg_to_dtype = frozenset(
-                    (k, process_dtype(v)) for k, v in arg_to_dtype.items())
-
+    def get_code(
+            self, entrypoint: str,
+            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> str:
         kernel = self.get_typed_and_scheduled_translation_unit(
                 entrypoint, arg_to_dtype)
 
@@ -887,10 +871,10 @@ class KernelExecutorBase:
     def get_invoker_uncached(self, program, entrypoint, *args):
         raise NotImplementedError()
 
-    def get_invoker(self, program, entrypoint, *args):
+    def get_invoker(self, t_unit, entrypoint, *args):
         from loopy import CACHING_ENABLED
 
-        cache_key = (self.__class__.__name__, (program, entrypoint))
+        cache_key = (self.__class__.__name__, (t_unit, entrypoint))
 
         if CACHING_ENABLED:
             try:
@@ -900,7 +884,7 @@ class KernelExecutorBase:
 
         logger.debug("%s: invoker cache miss" % entrypoint)
 
-        invoker = self.get_invoker_uncached(program, entrypoint, *args)
+        invoker = self.get_invoker_uncached(t_unit, entrypoint, *args)
 
         if CACHING_ENABLED:
             invoker_cache.store_if_not_present(cache_key, invoker)
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 3c6ff52b052035a0680146df3b7272751b699373..9974803c5c08ea4696f91b3e7ba31213f23c35c7 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -24,17 +24,25 @@ THE SOFTWARE.
 """
 
 
+from typing import cast, Tuple
+
 import numpy as np  # noqa
+import pymbolic.primitives as p
+from pymbolic import var
+from pymbolic.mapper.stringifier import PREC_NONE
+from pytools import memoize_method
+from cgen import Generable, Declarator, Const
+
 from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from loopy.diagnostic import LoopyError
 from loopy.symbolic import Literal
-from pymbolic import var
-import pymbolic.primitives as p
-from loopy.kernel.data import AddressSpace
-from pymbolic.mapper.stringifier import PREC_NONE
-
-from pytools import memoize_method
+from loopy.schedule import CallKernel
+from loopy.typing import ExpressionT
+from loopy.types import LoopyType
+from loopy.kernel.data import AddressSpace, TemporaryVariable, ArrayArg
+from loopy.codegen import CodeGenerationState
+from loopy.codegen.result import CodeGenerationResult
 
 
 # {{{ expression mapper
@@ -110,7 +118,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
                 from loopy.kernel.array import get_access_info
                 from pymbolic import evaluate
 
-                access_info = get_access_info(self.kernel.target, ary, expr.index,
+                access_info = get_access_info(self.kernel, ary, expr.index,
                     lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                     self.codegen_state.vectorization_info)
 
@@ -159,15 +167,6 @@ class ISPCTarget(CFamilyTarget):
     Intel CPUs with wide vector units.
     """
 
-    def __init__(self, occa_mode=False):
-        """
-        :arg occa_mode: Whether to modify the generated call signature to
-            be compatible with OCCA
-        """
-        self.occa_mode = occa_mode
-
-        super().__init__()
-
     host_program_name_suffix = ""
     device_program_name_suffix = "_inner"
 
@@ -201,43 +200,36 @@ class ISPCTarget(CFamilyTarget):
 
 
 class ISPCASTBuilder(CFamilyASTBuilder):
-    def _arg_names_and_decls(self, codegen_state):
-        implemented_data_info = codegen_state.implemented_data_info
-        arg_names = [iai.name for iai in implemented_data_info]
-
-        arg_decls = [
-                self.idi_to_cgen_declarator(codegen_state.kernel, idi)
-                for idi in implemented_data_info]
-
-        # {{{ occa compatibility hackery
-
-        from cgen import Value
-        if self.target.occa_mode:
-            from cgen import ArrayOf, Const
-            from cgen.ispc import ISPCUniform
-
-            arg_decls = [
-                    Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
-                    Const(ISPCUniform(Value("int", "o1"))),
-                    Const(ISPCUniform(Value("int", "o2"))),
-                    Const(ISPCUniform(Value("int", "o3"))),
-                    ] + arg_decls
-            arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names
-
-        # }}}
-
-        return arg_names, arg_decls
-
     # {{{ top-level codegen
 
-    def get_function_declaration(self, codegen_state, codegen_result,
-            schedule_index):
+    def get_function_declaration(self, codegen_state: CodeGenerationState,
+            codegen_result: CodeGenerationResult, schedule_index: int) -> Generable:
         name = codegen_result.current_program(codegen_state).name
+        kernel = codegen_state.kernel
+
+        assert codegen_state.kernel.linearization is not None
+        subkernel_name = cast(
+                        CallKernel,
+                        codegen_state.kernel.linearization[schedule_index]
+                        ).kernel_name
 
         from cgen import (FunctionDeclaration, Value)
         from cgen.ispc import ISPCExport, ISPCTask
 
-        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
+        if codegen_state.is_entrypoint:
+            # subkernel launches occur only as part of entrypoint kernels for now
+            from loopy.schedule.tools import get_subkernel_arg_info
+            skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
+            passed_names = skai.passed_names
+            written_names = skai.written_names
+        else:
+            passed_names = [arg.name for arg in kernel.args]
+            written_names = kernel.get_written_variables()
+
+        arg_decls = [self.arg_to_cgen_declarator(
+                            kernel, arg_name,
+                            is_written=arg_name in written_names)
+                        for arg_name in passed_names]
 
         if codegen_state.is_generating_device_code:
             result = ISPCTask(
@@ -253,9 +245,11 @@ class ISPCASTBuilder(CFamilyASTBuilder):
         from loopy.target.c import FunctionDeclarationWrapper
         return FunctionDeclarationWrapper(result)
 
-    # }}}
-
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+    def get_kernel_call(self, codegen_state: CodeGenerationState,
+            subkernel_name: str,
+            gsize: Tuple[ExpressionT, ...],
+            lsize: Tuple[ExpressionT, ...]) -> Generable:
+        kernel = codegen_state.kernel
         ecm = self.get_expression_to_code_mapper(codegen_state)
 
         from pymbolic.mapper.stringifier import PREC_NONE
@@ -267,19 +261,27 @@ class ISPCASTBuilder(CFamilyASTBuilder):
                         "assert(programCount == (%s))"
                         % ecm(lsize[0], PREC_NONE)))
 
-        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
+        if codegen_state.is_entrypoint:
+            # subkernel launches occur only as part of entrypoint kernels for now
+            from loopy.schedule.tools import get_subkernel_arg_info
+            skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
+            passed_names = skai.passed_names
+        else:
+            passed_names = [arg.name for arg in kernel.args]
 
         from cgen.ispc import ISPCLaunch
         result.append(
                 ISPCLaunch(
                     tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize),
                     "{}({})".format(
-                        name,
-                        ", ".join(arg_names)
+                        subkernel_name,
+                        ", ".join(passed_names)
                         )))
 
         return Block(result)
 
+    # }}}
+
     # {{{ code generation guts
 
     def get_expression_to_c_expression_mapper(self, codegen_state):
@@ -302,17 +304,43 @@ class ISPCASTBuilder(CFamilyASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info):
-        from loopy.target.c import POD  # uses the correct complex type
-        temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
+    # }}}
+
+    # {{{ declarators
+
+    def get_value_arg_declaraotor(
+            self, name: str, dtype: LoopyType, is_written: bool) -> Declarator:
+        from cgen.ispc import ISPCUniform
+        return ISPCUniform(super().get_value_arg_declaraotor(
+                name, dtype, is_written))
+
+    def get_array_arg_declarator(
+            self, arg: ArrayArg, is_written: bool) -> Declarator:
+        # FIXME restrict?
+        from cgen.ispc import ISPCUniformPointer, ISPCUniform
+        decl = ISPCUniform(
+                ISPCUniformPointer(self.get_array_base_declarator(arg)))
 
-        shape = decl_info.shape
+        if not is_written:
+            decl = Const(decl)
+
+        return decl
+
+    def get_temporary_var_declarator(self,
+            codegen_state: CodeGenerationState,
+            temp_var: TemporaryVariable) -> Declarator:
+        temp_var_decl = self.get_array_base_declarator(temp_var)
+
+        shape = temp_var.shape
+
+        assert isinstance(shape, tuple)
 
         if temp_var.address_space == AddressSpace.PRIVATE:
             # FIXME: This is a pretty coarse way of deciding what
             # private temporaries get duplicated. Refine? (See also
             # above in expr to code mapper)
-            _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs()
+            _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs(
+                    codegen_state.callables_table)
             shape = lsize + shape
 
         if shape:
@@ -325,50 +353,9 @@ class ISPCASTBuilder(CFamilyASTBuilder):
 
         return temp_var_decl
 
-    def wrap_temporary_decl(self, decl, scope):
-        from cgen.ispc import ISPCUniform
-        return ISPCUniform(decl)
-
-    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
-        from loopy.target.c import POD  # uses the correct complex type
-        from cgen import Const
-        from cgen.ispc import ISPCUniformPointer, ISPCUniform
-
-        arg_decl = ISPCUniformPointer(POD(self, dtype, name))
-
-        if not is_written:
-            arg_decl = Const(arg_decl)
-
-        arg_decl = ISPCUniform(arg_decl)
-
-        return arg_decl
-
-    def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from warnings import warn
-        warn("get_global_arg_decl is deprecated use get_array_arg_decl "
-                "instead.", DeprecationWarning, stacklevel=2)
-        return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape,
-                dtype, is_written)
-
-    def get_value_arg_decl(self, name, shape, dtype, is_written):
-        result = super().get_value_arg_decl(
-                name, shape, dtype, is_written)
-
-        from cgen import Reference, Const
-        was_const = isinstance(result, Const)
-
-        if was_const:
-            result = result.subdecl
-
-        if self.target.occa_mode:
-            result = Reference(result)
-
-        if was_const:
-            result = Const(result)
-
-        from cgen.ispc import ISPCUniform
-        return ISPCUniform(result)
+    # }}}
 
+    # {{{ emit_...
     def emit_assignment(self, codegen_state, insn):
         kernel = codegen_state.kernel
         ecm = codegen_state.expression_to_code_mapper
@@ -404,7 +391,7 @@ class ISPCASTBuilder(CFamilyASTBuilder):
             index_tuple = tuple(
                     simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
 
-            access_info = get_access_info(kernel.target, ary, index_tuple,
+            access_info = get_access_info(kernel, ary, index_tuple,
                     lambda expr: evaluate(expr, codegen_state.var_subst_map),
                     codegen_state.vectorization_info)
 
@@ -505,6 +492,7 @@ class ISPCASTBuilder(CFamilyASTBuilder):
                     PREC_NONE, "i"),
                 "++%s" % iname,
                 inner)
+
     # }}}
 
 
diff --git a/loopy/target/numba.py b/loopy/target/numba.py
deleted file mode 100644
index 2df81ec1f332be87d8ca361480a37b68b369b56f..0000000000000000000000000000000000000000
--- a/loopy/target/numba.py
+++ /dev/null
@@ -1,219 +0,0 @@
-"""Python host AST builder for integration with PyOpenCL."""
-
-
-__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-from pytools import memoize_method
-
-from loopy.target.python import ExpressionToPythonMapper, PythonASTBuilderBase
-from loopy.target import TargetBase, DummyHostASTBuilder
-
-from loopy.diagnostic import LoopyWarning
-
-
-# {{{ base numba
-
-def _base_numba_preamble_generator(preamble_info):
-    yield ("06_numba_imports", """
-            import numba as _lpy_numba
-            """)
-
-
-class NumbaBaseASTBuilder(PythonASTBuilderBase):
-    def preamble_generators(self):
-        return (
-                super().preamble_generators() + [
-                    _base_numba_preamble_generator
-                    ])
-
-    def get_function_definition(self, codegen_state, codegen_result,
-            schedule_index,
-            function_decl, function_body):
-
-        assert function_decl is None
-
-        from genpy import Function
-        return Function(
-                codegen_result.current_program(codegen_state).name,
-                [idi.name for idi in codegen_state.implemented_data_info],
-                function_body,
-                decorators=self.get_python_function_decorators())
-
-    def get_python_function_decorators(self):
-        return ()
-
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
-        from pymbolic.mapper.stringifier import PREC_NONE
-        from genpy import Statement
-
-        ecm = self.get_expression_to_code_mapper(codegen_state)
-        implemented_data_info = codegen_state.implemented_data_info
-
-        return Statement(
-            "{}[{}, {}]({})".format(
-                name,
-                ecm(gsize, PREC_NONE),
-                ecm(lsize, PREC_NONE),
-                ", ".join(idi.name for idi in implemented_data_info)
-                ))
-
-
-class NumbaJITASTBuilder(NumbaBaseASTBuilder):
-    def get_python_function_decorators(self):
-        return ("@_lpy_numba.jit",)
-
-
-class NumbaTarget(TargetBase):
-    """A target for plain Python as understood by Numba, without any parallel extensions.
-    """
-
-    def __init__(self):
-        from warnings import warn
-        warn("The Numba targets are not yet feature-complete",
-                LoopyWarning, stacklevel=2)
-
-    def split_kernel_at_global_barriers(self):
-        return False
-
-    def get_host_ast_builder(self):
-        return DummyHostASTBuilder(self)
-
-    def get_device_ast_builder(self):
-        return NumbaJITASTBuilder(self)
-
-    # {{{ types
-
-    @memoize_method
-    def get_dtype_registry(self):
-        from loopy.target.c import DTypeRegistryWrapper
-        from loopy.target.c.compyte.dtypes import (
-                DTypeRegistry, fill_registry_with_c_types)
-        result = DTypeRegistry()
-        fill_registry_with_c_types(result, respect_windows=False,
-                include_bool=True)
-        return DTypeRegistryWrapper(result)
-
-    def is_vector_dtype(self, dtype):
-        return False
-
-    def get_vector_dtype(self, base, count):
-        raise KeyError()
-
-    def get_or_register_dtype(self, names, dtype=None):
-        # These kind of shouldn't be here.
-        return self.get_dtype_registry().get_or_register_dtype(names, dtype)
-
-    def dtype_to_typename(self, dtype):
-        # These kind of shouldn't be here.
-        return self.get_dtype_registry().dtype_to_ctype(dtype)
-
-    # }}}
-
-# }}}
-
-
-# {{{ numba.cuda
-
-class NumbaCudaExpressionToPythonMapper(ExpressionToPythonMapper):
-    _GRID_AXES = "xyz"
-
-    def map_group_hw_index(self, expr, enclosing_prec):
-        return "_lpy_ncu.blockIdx.%s" % self._GRID_AXES[expr.axis]
-
-    def map_local_hw_index(self, expr, enclosing_prec):
-        return "_lpy_ncu.threadIdx.%s" % self._GRID_AXES[expr.axis]
-
-
-def _cuda_numba_preamble_generator(preamble_info):
-    yield ("06_import_numba_cuda", """
-            import numba.cuda as _lpy_ncu
-            """)
-
-
-class NumbaCudaASTBuilder(NumbaBaseASTBuilder):
-    def preamble_generators(self):
-        return (
-                super().preamble_generators() + [
-                    _cuda_numba_preamble_generator
-                    ])
-
-    def get_python_function_decorators(self):
-        return ("@_lpy_ncu.jit",)
-
-    def get_expression_to_code_mapper(self, codegen_state):
-        return NumbaCudaExpressionToPythonMapper(codegen_state)
-
-
-class NumbaCudaTarget(TargetBase):
-    """A target for Numba with CUDA extensions.
-    """
-
-    host_program_name_suffix = ""
-    device_program_name_suffix = "_inner"
-
-    def __init__(self):
-        from warnings import warn
-        warn("The Numba target is not yet feature-complete",
-                LoopyWarning, stacklevel=2)
-
-    def split_kernel_at_global_barriers(self):
-        return True
-
-    def get_host_ast_builder(self):
-        return NumbaBaseASTBuilder(self)
-
-    def get_device_ast_builder(self):
-        return NumbaCudaASTBuilder(self)
-
-    # {{{ types
-
-    @memoize_method
-    def get_dtype_registry(self):
-        from loopy.target.c import DTypeRegistryWrapper
-        from loopy.target.c.compyte.dtypes import (
-                DTypeRegistry, fill_registry_with_c_types)
-        result = DTypeRegistry()
-        fill_registry_with_c_types(result, respect_windows=False,
-                include_bool=True)
-        return DTypeRegistryWrapper(result)
-
-    def is_vector_dtype(self, dtype):
-        return False
-
-    def get_vector_dtype(self, base, count):
-        raise KeyError()
-
-    def get_or_register_dtype(self, names, dtype=None):
-        # These kind of shouldn't be here.
-        return self.get_dtype_registry().get_or_register_dtype(names, dtype)
-
-    def dtype_to_typename(self, dtype):
-        # These kind of shouldn't be here.
-        return self.get_dtype_registry().dtype_to_ctype(dtype)
-
-    # }}}
-
-# }}}
-
-# vim: foldmethod=marker
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 437dd66818e4b11dab784851cdaba631007eafb0..d9b23670ea92d7f1576a7a0f313286e3eade4196 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -24,16 +24,18 @@ THE SOFTWARE.
 """
 
 import numpy as np
+from pymbolic import var
+from pytools import memoize_method
+from cgen import Declarator
 
 from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
-from pytools import memoize_method
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper
-from loopy.kernel.data import AddressSpace
+from loopy.kernel.array import VectorArrayDimTag, FixedStrideArrayDimTag, ArrayBase
+from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg
 from loopy.kernel.function_interface import ScalarCallable
-from pymbolic import var
 
 
 # {{{ dtype registry wrappers
@@ -668,8 +670,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder):
 
     # }}}
 
-    # {{{ code generation guts
-
     def get_expression_to_c_expression_mapper(self, codegen_state):
         return ExpressionToOpenCLCExpressionMapper(codegen_state)
 
@@ -695,68 +695,71 @@ class OpenCLCASTBuilder(CFamilyASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def wrap_temporary_decl(self, decl, scope):
-        if scope == AddressSpace.LOCAL:
-            from cgen.opencl import CLLocal
+    # {{{ declarators
+
+    def wrap_decl_for_address_space(
+            self, decl: Declarator, address_space: AddressSpace) -> Declarator:
+        from cgen.opencl import CLGlobal, CLLocal
+        if address_space == AddressSpace.GLOBAL:
+            return CLGlobal(decl)
+        elif address_space == AddressSpace.LOCAL:
             return CLLocal(decl)
-        elif scope == AddressSpace.PRIVATE:
+        elif address_space == AddressSpace.PRIVATE:
             return decl
         else:
-            raise ValueError("unexpected temporary variable scope: %s"
-                    % scope)
+            raise ValueError("unexpected temporary variable address space: %s"
+                    % address_space)
+
+    def wrap_global_constant(self, decl: Declarator) -> Declarator:
+        from cgen.opencl import CLGlobal, CLConstant
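+        # The incoming declaration was wrapped in CLGlobal by
+        # wrap_decl_for_address_space; unwrap it, since "__constant" replaces
+        # "__global" rather than adding to it.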
+        assert isinstance(decl, CLGlobal)
+        decl = decl.subdecl
 
-    def wrap_global_constant(self, decl):
-        from cgen.opencl import CLConstant
         return CLConstant(decl)
 
-    def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written):
-        from cgen.opencl import CLGlobal, CLLocal
-        from loopy.kernel.data import AddressSpace
+    # duplicated in CUDA, update there if updating here
+    def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
+        dtype = ary.dtype
 
-        if mem_address_space == AddressSpace.LOCAL:
-            return CLLocal(super().get_array_arg_decl(
-                name, mem_address_space, shape, dtype, is_written))
-        elif mem_address_space == AddressSpace.PRIVATE:
-            return super().get_array_arg_decl(
-                name, mem_address_space, shape, dtype, is_written)
-        elif mem_address_space == AddressSpace.GLOBAL:
-            return CLGlobal(super().get_array_arg_decl(
-                name, mem_address_space, shape, dtype, is_written))
-        else:
-            raise ValueError("unexpected array argument scope: %s"
-                    % mem_address_space)
+        vec_size = ary.vector_size(self.target)
+        if vec_size > 1:
+            dtype = self.target.vector_dtype(dtype, vec_size)
 
-    def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.kernel.data import AddressSpace
-        from warnings import warn
-        warn("get_global_arg_decl is deprecated use get_array_arg_decl "
-                "instead.", DeprecationWarning, stacklevel=2)
+        if ary.dim_tags:
+            for dim_tag in ary.dim_tags:
+                if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)):
+                    # we're OK with those
+                    pass
+
+                else:
+                    raise NotImplementedError(
+                        f"{type(self).__name__} does not understand axis tag "
+                        f"'{type(dim_tag)}'.")
+
+        from loopy.target.c import POD
+        return POD(self, dtype, ary.name)
+
+    def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator:
+        from cgen import RestrictPointer
+        from cgen.opencl import CLConstant
 
-        return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape,
-                dtype, is_written)
+        # constant *is* an address space as far as CL is concerned, do not re-wrap
+        return CLConstant(RestrictPointer(self.get_array_base_declarator(
+                arg)))
 
-    def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
+    def get_image_arg_declarator(
+            self, arg: ImageArg, is_written: bool) -> Declarator:
         if is_written:
             mode = "w"
         else:
             mode = "r"
 
         from cgen.opencl import CLImage
-        return CLImage(num_target_axes, mode, name)
-
-    def get_constant_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.target.c import POD  # uses the correct complex type
-        from cgen import RestrictPointer, Const
-        from cgen.opencl import CLConstant
-
-        arg_decl = RestrictPointer(POD(self, dtype, name))
+        return CLImage(arg.num_target_axes(), mode, arg.name)
 
-        if not is_written:
-            arg_decl = Const(arg_decl)
-
-        return CLConstant(arg_decl)
+    # }}}
 
-    # {{{
+    # {{{ atomics
 
     def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
@@ -766,10 +769,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder):
         return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context)
 
-    # }}}
-
-    # {{{ code generation for atomic update
-
     def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
         from pymbolic.mapper.stringifier import PREC_NONE
@@ -881,8 +880,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder):
 
     # }}}
 
-    # }}}
-
 # }}}
 
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 01dc024c694244c3461944bf79ffdb962216a146..66dd9ae3f255981df67f59ec202881f4e8f77928 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -22,12 +22,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from typing import Sequence, Mapping, Tuple, Dict, List, Union
+
 import numpy as np
 import pymbolic.primitives as p
+import genpy
 
 from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder,
         ExpressionToOpenCLCExpressionMapper)
 from loopy.target.python import PythonASTBuilderBase
+from loopy.kernel import LoopKernel
 from loopy.types import NumpyType
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from warnings import warn
@@ -527,20 +531,22 @@ class PyOpenCLTarget(OpenCLTarget):
 
 # {{{ host code: value arg setup
 
-def generate_value_arg_setup(kernel, implemented_data_info):
+def generate_value_arg_setup(
+        kernel: LoopKernel, passed_names: Sequence[str]
+        ) -> Tuple[genpy.Generable, Mapping[int, int], int]:
     options = kernel.options
 
     import loopy as lp
     from loopy.kernel.array import ArrayBase
 
     cl_arg_idx = 0
-    arg_idx_to_cl_arg_idx = {}
+    arg_idx_to_cl_arg_idx: Dict[int, int] = {}
 
     fp_arg_count = 0
 
     from genpy import If, Raise, Statement as S, Suite
 
-    result = []
+    result: List[str] = []
     gen = result.append
 
     buf_indices_and_args = []
@@ -557,11 +563,18 @@ def generate_value_arg_setup(kernel, implemented_data_info):
             buf_indices_and_args.append(arg_idx)
             buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")
 
-    for arg_idx, idi in enumerate(implemented_data_info):
+    for arg_idx, passed_name in enumerate(passed_names):
         arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
 
-        if not issubclass(idi.arg_class, lp.ValueArg):
-            assert issubclass(idi.arg_class, ArrayBase)
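+        # Inames may appear among the passed names (e.g. sequential loop
+        # indices surrounding a subkernel launch); pass their values as
+        # scalars of the kernel's index dtype.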
+        if passed_name in kernel.all_inames():
+            add_buf_arg(cl_arg_idx, kernel.index_dtype.numpy_dtype.char, passed_name)
+            cl_arg_idx += 1
+            continue
+
+        var_descr = kernel.get_var_descriptor(passed_name)
+
+        if not isinstance(var_descr, lp.ValueArg):
+            assert isinstance(var_descr, ArrayBase)
 
             # assume each of those generates exactly one...
             cl_arg_idx += 1
@@ -569,20 +582,20 @@ def generate_value_arg_setup(kernel, implemented_data_info):
             continue
 
         if not options.skip_arg_checks:
-            gen(If("%s is None" % idi.name,
-                Raise('RuntimeError("input argument \'{name}\' '
-                        'must be supplied")'.format(name=idi.name))))
+            gen(If(f"{passed_name} is None",
+                Raise(f'RuntimeError("input argument \'{var_descr.name}\' '
+                        'must be supplied")')))
 
-        if idi.dtype.is_composite():
+        if var_descr.dtype.is_composite():
             buf_indices_and_args.append(cl_arg_idx)
-            buf_indices_and_args.append(f"{idi.name}")
+            buf_indices_and_args.append(passed_name)
 
             cl_arg_idx += 1
 
-        elif idi.dtype.is_complex():
-            assert isinstance(idi.dtype, NumpyType)
+        elif var_descr.dtype.is_complex():
+            assert isinstance(var_descr.dtype, NumpyType)
 
-            dtype = idi.dtype
+            dtype = var_descr.dtype
 
             if dtype.numpy_dtype == np.complex64:
                 arg_char = "f"
@@ -594,21 +607,21 @@ def generate_value_arg_setup(kernel, implemented_data_info):
             buf_indices_and_args.append(cl_arg_idx)
             buf_indices_and_args.append(
                 f"_lpy_pack('{arg_char}{arg_char}', "
-                f"{idi.name}.real, {idi.name}.imag)")
+                f"{passed_name}.real, {passed_name}.imag)")
             cl_arg_idx += 1
 
             fp_arg_count += 2
 
-        elif isinstance(idi.dtype, NumpyType):
-            if idi.dtype.dtype.kind == "f":
+        elif isinstance(var_descr.dtype, NumpyType):
+            if var_descr.dtype.dtype.kind == "f":
                 fp_arg_count += 1
 
-            add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name)
+            add_buf_arg(cl_arg_idx, var_descr.dtype.dtype.char, passed_name)
             cl_arg_idx += 1
 
         else:
             raise LoopyError("do not know how to pass argument of type '%s'"
-                    % idi.dtype)
+                    % var_descr.dtype)
 
     for arg_kind, args_and_indices, entry_length in [
             ("_buf", buf_indices_and_args, 2),
@@ -625,18 +638,23 @@ def generate_value_arg_setup(kernel, implemented_data_info):
 # }}}
 
 
-def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx):
+def generate_array_arg_setup(kernel: LoopKernel, passed_names: Sequence[str],
+        arg_idx_to_cl_arg_idx: Mapping[int, int]) -> genpy.Generable:
     from loopy.kernel.array import ArrayBase
     from genpy import Statement as S, Suite
 
-    result = []
+    result: List[str] = []
     gen = result.append
 
-    cl_indices_and_args = []
-    for arg_idx, arg in enumerate(implemented_data_info):
-        if issubclass(arg.arg_class, ArrayBase):
+    cl_indices_and_args: List[Union[int, str]] = []
+    for arg_idx, passed_name in enumerate(passed_names):
+        if passed_name in kernel.all_inames():
+            continue
+
+        var_descr = kernel.get_var_descriptor(passed_name)
+        if isinstance(var_descr, ArrayBase):
             cl_indices_and_args.append(arg_idx_to_cl_arg_idx[arg_idx])
-            cl_indices_and_args.append(arg.name)
+            cl_indices_and_args.append(passed_name)
 
     if cl_indices_and_args:
         assert len(cl_indices_and_args) % 2 == 0
@@ -656,13 +674,18 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
 
     # {{{ code generation guts
 
-    def get_function_definition(self, codegen_state, codegen_result,
-            schedule_index, function_decl, function_body):
-        from loopy.kernel.data import TemporaryVariable
+    def get_function_definition(
+            self, codegen_state, codegen_result,
+            schedule_index: int, function_decl, function_body: genpy.Generable
+            ) -> genpy.Function:
+        assert schedule_index == 0
+
+        from loopy.schedule.tools import get_kernel_arg_info
+        kai = get_kernel_arg_info(codegen_state.kernel)
+
         args = (
                 ["_lpy_cl_kernels", "queue"]
-                + [idi.name for idi in codegen_state.implemented_data_info
-                    if not issubclass(idi.arg_class, TemporaryVariable)]
+                + list(kai.passed_arg_names)
                 + ["wait_for=None", "allocator=None"])
 
         from genpy import (For, Function, Suite, Return, Line, Statement as S)
@@ -702,14 +725,6 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
 
     def get_temporary_decls(self, codegen_state, schedule_index):
         from genpy import Assign, Comment, Line
-        from collections import defaultdict
-        from numbers import Number
-        import pymbolic.primitives as prim
-
-        def alloc_nbytes(tv):
-            from functools import reduce
-            from operator import mul
-            return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1)
 
         from pymbolic.mapper.stringifier import PREC_NONE
         ecm = self.get_expression_to_code_mapper(codegen_state)
@@ -718,37 +733,14 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         if not global_temporaries:
             return []
 
-        # {{{ allocate space for the base_storage
-
-        base_storage_sizes = defaultdict(set)
-
-        for tv in global_temporaries:
-            if tv.base_storage:
-                base_storage_sizes[tv.base_storage].add(tv.nbytes)
-
-        # }}}
-
         allocated_var_names = []
         code_lines = []
         code_lines.append(Line())
         code_lines.append(Comment("{{{ allocate global temporaries"))
         code_lines.append(Line())
 
-        for name, sizes in base_storage_sizes.items():
-            if all(isinstance(s, Number) for s in sizes):
-                size = max(sizes)
-            else:
-                size = prim.Max(tuple(sizes))
-
-            allocated_var_names.append(name)
-            code_lines.append(Assign(name,
-                                     f"allocator({ecm(size, PREC_NONE, 'i')})"))
-
         for tv in global_temporaries:
-            if tv.base_storage:
-                assert tv.base_storage in base_storage_sizes
-                code_lines.append(Assign(tv.name, tv.base_storage))
-            else:
+            if not tv.base_storage:
                 nbytes_str = ecm(tv.nbytes, PREC_NONE, "i")
                 allocated_var_names.append(tv.name)
                 code_lines.append(Assign(tv.name,
@@ -763,7 +755,12 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
 
         return code_lines
 
-    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+    def get_kernel_call(self,
+            codegen_state, subkernel_name, gsize, lsize):
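+        # passed_names gives the argument order of the generated subkernel,
+        # so the host-side argument setup below must follow the same order.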
+        from loopy.schedule.tools import get_subkernel_arg_info
+        skai = get_subkernel_arg_info(
+                codegen_state.kernel, subkernel_name)
+
         ecm = self.get_expression_to_code_mapper(codegen_state)
 
         if not gsize:
@@ -771,16 +768,10 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         if not lsize:
             lsize = (1,)
 
-        all_args = codegen_state.implemented_data_info + extra_args
-
         value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
-            generate_value_arg_setup(
-                    codegen_state.kernel,
-                    all_args)
+            generate_value_arg_setup(codegen_state.kernel, skai.passed_names)
         arry_arg_code = generate_array_arg_setup(
-            codegen_state.kernel,
-            all_args,
-            arg_idx_to_cl_arg_idx)
+            codegen_state.kernel, skai.passed_names, arg_idx_to_cl_arg_idx)
 
         from genpy import Suite, Assign, Assert, Line, Comment
         from pymbolic.mapper.stringifier import PREC_NONE
@@ -794,10 +785,11 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
 
         # TODO: Generate finer-grained dependency structure
         return Suite([
-            Comment("{{{ enqueue %s" % name),
+            Comment("{{{ enqueue %s" % subkernel_name),
             Line(),
-            Assign("_lpy_knl", "_lpy_cl_kernels."+name),
-            Assert(f"_lpy_knl.num_args == {cl_arg_count}, f'Kernel \"{name}\" "
+            Assign("_lpy_knl", "_lpy_cl_kernels."+subkernel_name),
+            Assert(f"_lpy_knl.num_args == {cl_arg_count}, "
+                   f"f'Kernel \"{subkernel_name}\" "
                    f"invoker argument count ({cl_arg_count}) does not match the "
                    # No f"" here since {_lpy_knl.num_args} needs to be evaluated
                    # at runtime, not here.
@@ -858,7 +850,6 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
     def get_expression_to_c_expression_mapper(self, codegen_state):
         return ExpressionToPyOpenCLCExpressionMapper(codegen_state)
 
-
 # }}}
 
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index d20ddf1d2a632b75fbe4cd892a0ba40ea8b8a87f..d870a11b60177f4385d189ac066833d859cbb6f7 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -21,15 +21,31 @@ THE SOFTWARE.
 """
 
 
+from typing import Sequence, Tuple, Union, Callable, Any, Optional, TYPE_CHECKING
+from dataclasses import dataclass
+
 import numpy as np
+from immutables import Map
+
 from pytools import memoize_method
-from pytools.py_codegen import Indentation
+from pytools.codegen import Indentation, CodeGenerator
+
+from loopy.types import LoopyType
+from loopy.typing import ExpressionT
+from loopy.kernel import LoopKernel
+from loopy.kernel.data import ArrayArg
+from loopy.translation_unit import TranslationUnit
+from loopy.schedule.tools import KernelArgInfo
 from loopy.target.execution import (
-    KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels)
+    KernelExecutorBase, ExecutionWrapperGeneratorBase)
 import logging
 logger = logging.getLogger(__name__)
 
 
+if TYPE_CHECKING:
+    import pyopencl as cl
+
+
 # {{{ invoker generation
 
 # /!\ This code runs in a namespace controlled by the user.
@@ -90,18 +106,22 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ handle allocation of unspecified arguments
 
-    def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
+    def handle_alloc(
+            self, gen: CodeGenerator, arg: ArrayArg,
+            strify: Callable[[Union[ExpressionT, Tuple[ExpressionT, ...]]], str],
+            skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguments for pyopencl execution
         """
         from pymbolic import var
 
-        num_axes = len(arg.strides)
+        from loopy.kernel.array import get_strides
+        strides = get_strides(arg)
+        num_axes = len(strides)
 
-        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+        itemsize = arg.dtype.numpy_dtype.itemsize
         for i in range(num_axes):
-            gen("_lpy_ustrides_%d = %s" % (i, strify(
-                arg.unvec_strides[i])))
+            gen("_lpy_ustrides_%d = %s" % (i, strify(strides[i])))
 
         if not skip_arg_checks:
             for i in range(num_axes):
@@ -109,12 +129,11 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                         "\"'%s' has negative stride in axis %d\""
                         % (i, arg.name, i))
 
+        assert isinstance(arg.shape, tuple)
         sym_ustrides = tuple(
                 var("_lpy_ustrides_%d" % i)
                 for i in range(num_axes))
-        sym_shape = tuple(
-                arg.unvec_shape[i]
-                for i in range(num_axes))
+        sym_shape = tuple(arg.shape[i] for i in range(num_axes))
 
         size_expr = (sum(astrd*(alen-1)
             for alen, astrd in zip(sym_shape, sym_ustrides))
@@ -123,7 +142,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         gen("_lpy_size = %s" % strify(size_expr))
         sym_strides = tuple(itemsize*s_i for s_i in sym_ustrides)
 
-        dtype_name = self.python_dtype_str(gen, kernel_arg.dtype.numpy_dtype)
+        dtype_name = self.python_dtype_str(gen, arg.dtype.numpy_dtype)
         gen(f"{arg.name} = _lpy_cl_array.Array(None, {strify(sym_shape)}, "
                 f"{dtype_name}, strides={strify(sym_strides)}, "
                 f"data=allocator({strify(itemsize * var('_lpy_size'))}), "
@@ -147,6 +166,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
         gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
         gen.add_to_preamble("from struct import pack as _lpy_pack")
+        from loopy.target.c.c_execution import DEF_EVEN_DIV_FUNCTION
+        gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION)
 
     def initialize_system_args(self, gen):
         """
@@ -159,8 +180,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args,
-            kernel, implemented_data_info):
+    def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
+            kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None:
         if kernel.options.cl_exec_manage_array_events:
             gen("""
                 if wait_for is None:
@@ -168,47 +189,35 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                 """)
 
             gen("")
-            from loopy.kernel.data import ArrayArg
-            for arg in implemented_data_info:
-                if issubclass(arg.arg_class, ArrayArg):
+            for arg_name in kai.passed_arg_names:
+                arg = kernel.arg_dict[arg_name]
+                if isinstance(arg, ArrayArg):
                     gen(
                             "wait_for.extend({arg_name}.events)"
                             .format(arg_name=arg.name))
 
             gen("")
 
-        gen("_lpy_evt = {kernel_name}({args})"
-        .format(
-            kernel_name=kernel_name,
-            args=", ".join(
-                ["_lpy_cl_kernels", "queue"]
-                + args
-                + ["wait_for=wait_for", "allocator=allocator"])))
+        arg_list = (["_lpy_cl_kernels", "queue"]
+                + list(args)
+                + ["wait_for=wait_for", "allocator=allocator"])
+        gen(f"_lpy_evt = {host_program_name}({', '.join(arg_list)})")
 
         if kernel.options.cl_exec_manage_array_events:
             gen("")
-            from loopy.kernel.data import ArrayArg
-            for arg in implemented_data_info:
-                if (issubclass(arg.arg_class, ArrayArg)
-                        and arg.base_name in kernel.get_written_variables()):
+            for arg_name in kai.passed_arg_names:
+                arg = kernel.arg_dict[arg_name]
+                if (isinstance(arg, ArrayArg)
+                        and arg.name in kernel.get_written_variables()):
                     gen(f"{arg.name}.add_event(_lpy_evt)")
 
     # }}}
 
-    # {{{
+    # {{{ generate_output_handler
 
-    def generate_output_handler(
-            self, gen, options, kernel, implemented_data_info):
-
-        from loopy.kernel.data import KernelArgument
-
-        def is_output(idi):
-            from loopy.kernel.array import ArrayBase
-            if not issubclass(idi.arg_class, ArrayBase):
-                return False
-
-            arg = kernel.impl_arg_to_arg[idi.name]
-            return arg.is_output
+    def generate_output_handler(self, gen: CodeGenerator,
+            kernel: LoopKernel, kai: KernelArgInfo) -> None:
+        options = kernel.options
 
         if not options.no_numpy:
             gen("if out_host is None and (_lpy_encountered_numpy "
@@ -216,33 +225,28 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
             with Indentation(gen):
                 gen("out_host = True")
 
-            for idi in implemented_data_info:
-                if not issubclass(idi.arg_class, KernelArgument):
-                    continue
-
-                if is_output(idi):
-                    np_name = "_lpy_%s_np_input" % idi.name
+            for arg_name in kai.passed_arg_names:
+                arg = kernel.arg_dict[arg_name]
+                if arg.is_output:
+                    np_name = "_lpy_%s_np_input" % arg.name
                     gen("if out_host or %s is not None:" % np_name)
                     with Indentation(gen):
                         gen("%s = %s.get(queue=queue, ary=%s)"
-                            % (idi.name, idi.name, np_name))
+                            % (arg.name, arg.name, np_name))
 
             gen("")
 
         if options.return_dict:
             gen("return _lpy_evt, {%s}"
-                    % ", ".join(f'"{idi.name}": {idi.name}'
-                        for idi in implemented_data_info
-                        if issubclass(idi.arg_class, KernelArgument)
-                        if is_output(idi)))
+                    % ", ".join(f'"{arg_name}": {arg_name}'
+                        for arg_name in kai.passed_arg_names
+                        if kernel.arg_dict[arg_name].is_output))
         else:
-            out_idis = [idi
-                    for idi in implemented_data_info
-                        if issubclass(idi.arg_class, KernelArgument)
-                    if is_output(idi)]
-            if out_idis:
+            out_names = [arg_name for arg_name in kai.passed_arg_names
+                    if kernel.arg_dict[arg_name].is_output]
+            if out_names:
                 gen("return _lpy_evt, (%s,)"
-                        % ", ".join(idi.name for idi in out_idis))
+                        % ", ".join(out_names))
             else:
                 gen("return _lpy_evt, ()")
 
@@ -257,6 +261,17 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 # }}}
 
 
+@dataclass(frozen=True)
+class _KernelInfo:
+    t_unit: TranslationUnit
+    cl_kernels: "_Kernels"
+    invoker: Callable[..., Any]
+
+
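+# Plain attribute container: translation_unit_info sets one attribute per
+# generated OpenCL kernel on an instance of this class.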
+class _Kernels:
+    pass
+
+
 # {{{ kernel executor
 
 class PyOpenCLKernelExecutor(KernelExecutorBase):
@@ -267,52 +282,45 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, context, program, entrypoint):
-        """
-        :arg context: a :class:`pyopencl.Context`
-        :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
-            (a warning will be issued if more than one is returned). If the
-            kernel has not yet been loop-scheduled, that is done, too, with no
-            specific arguments.
-        """
-
-        super().__init__(program, entrypoint)
+    def __init__(self, context: "cl.Context", t_unit, entrypoint):
+        super().__init__(t_unit, entrypoint)
 
         self.context = context
 
-    def get_invoker_uncached(self, program, entrypoint, codegen_result):
+    def get_invoker_uncached(self, t_unit, entrypoint, codegen_result):
         generator = PyOpenCLExecutionWrapperGenerator()
-        return generator(program, entrypoint, codegen_result)
+        return generator(t_unit, entrypoint, codegen_result)
 
     def get_wrapper_generator(self):
         return PyOpenCLExecutionWrapperGenerator()
 
     @memoize_method
-    def translation_unit_info(self, entrypoint, arg_to_dtype_set=frozenset(),
-            all_kwargs=None):
-        program = self.get_typed_and_scheduled_translation_unit(
-                entrypoint, arg_to_dtype_set)
+    def translation_unit_info(
+            self, entrypoint: str,
+            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
+        t_unit = self.get_typed_and_scheduled_translation_unit(
+                entrypoint, arg_to_dtype)
 
         # FIXME: now just need to add the types to the arguments
         from loopy.codegen import generate_code_v2
         from loopy.target.execution import get_highlighted_code
-        codegen_result = generate_code_v2(program)
+        codegen_result = generate_code_v2(t_unit)
 
         dev_code = codegen_result.device_code()
 
-        if program[entrypoint].options.write_code:
+        if t_unit[entrypoint].options.write_code:
             #FIXME: redirect to "translation unit" level option as well.
             output = dev_code
-            if self.program[entrypoint].options.allow_terminal_colors:
+            if self.t_unit[entrypoint].options.allow_terminal_colors:
                 output = get_highlighted_code(output)
 
-            if self.program[entrypoint].options.write_code is True:
+            if self.t_unit[entrypoint].options.write_code is True:
                 print(output)
             else:
-                with open(self.program[entrypoint].options.write_code, "w") as outf:
+                with open(self.t_unit[entrypoint].options.write_code, "w") as outf:
                     outf.write(output)
 
-        if program[entrypoint].options.edit_code:
+        if t_unit[entrypoint].options.edit_code:
             #FIXME: redirect to "translation unit" level option as well.
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.cl")
@@ -322,18 +330,16 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         #FIXME: redirect to "translation unit" level option as well.
         cl_program = (
                 cl.Program(self.context, dev_code)
-                .build(options=program[entrypoint].options.build_options))
+                .build(options=t_unit[entrypoint].options.build_options))
 
         cl_kernels = _Kernels()
         for dp in cl_program.kernel_names.split(";"):
             setattr(cl_kernels, dp, getattr(cl_program, dp))
 
         return _KernelInfo(
-                program=program,
+                t_unit=t_unit,
                 cl_kernels=cl_kernels,
-                implemented_data_info=codegen_result.implemented_data_infos[
-                    entrypoint],
-                invoker=self.get_invoker(program, entrypoint, codegen_result))
+                invoker=self.get_invoker(t_unit, entrypoint, codegen_result))
 
     def __call__(self, queue, *,
             allocator=None, wait_for=None, out_host=None, entrypoint=None,
@@ -366,6 +372,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         assert entrypoint is not None
 
+        # FIXME: Remove entrypoint argument
+        assert entrypoint == self.entrypoint
+
         if __debug__:
             self.check_for_required_array_arguments(kwargs.keys())
 
@@ -373,7 +382,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
             kwargs = self.packing_controller(kwargs)
 
         translation_unit_info = self.translation_unit_info(entrypoint,
-                self.arg_to_dtype_set(kwargs))
+                self.arg_to_dtype(kwargs))
 
         return translation_unit_info.invoker(
                 translation_unit_info.cl_kernels, queue, allocator, wait_for,
diff --git a/loopy/tools.py b/loopy/tools.py
index fe42918d8e5e3452f5ff11425dff17e499cd387b..5ae620bbab1582e1d90b1d29f4d0322b900a0600 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -21,7 +21,10 @@ THE SOFTWARE.
 """
 
 import collections.abc as abc
+from functools import cached_property
 
+from immutables import Map
+import islpy as isl
 import numpy as np
 from pytools import memoize_method, ProcessLogger
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
@@ -109,7 +112,13 @@ class LoopyKeyBuilder(KeyBuilderBase):
         getattr(prn, "print_"+key._base_name)(key)
         key_hash.update(prn.get_str().encode("utf8"))
 
-    update_for_Map = update_for_BasicSet  # noqa
+    def update_for_Map(self, key_hash, key):  # noqa
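+        # Both immutables.Map and isl.Map carry the class name "Map", so the
+        # name-based dispatch in the key builder lands here for either;
+        # disambiguate by isinstance.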
+        if isinstance(key, Map):
+            self.update_for_dict(key_hash, key)
+        elif isinstance(key, isl.Map):
+            self.update_for_BasicSet(key_hash, key)
+        else:
+            raise AssertionError()
 
     def update_for_pymbolic_expression(self, key_hash, key):
         if key is None:
@@ -673,8 +682,7 @@ class _CallablesUnresolver(RuleAwareIdentityMapper):
         self.callables_table = callables_table
         self.target = target
 
-    @property
-    @memoize_method
+    @cached_property
     def known_callables(self):
         from loopy.kernel.function_interface import CallableKernel
         return (frozenset(self.target.get_device_ast_builder().known_callables)
@@ -800,9 +808,9 @@ def _kernel_to_python(kernel, is_entrypoint=False, var_name="kernel"):
     % endif
             )
 
-    % for iname, tags in kernel.iname_to_tags.items():
-    % for tag in tags:
-    ${var_name} = lp.tag_inames(${var_name}, "${"%s:%s" %(iname, tag)}")
+    % for iname in kernel.inames.values():
+    % for tag in iname.tags:
+    ${var_name} = lp.tag_inames(${var_name}, "${"%s:%s" %(iname.name, tag)}")
     % endfor
 
     % endfor
@@ -856,8 +864,12 @@ def t_unit_to_python(t_unit, var_name="t_unit",
     knl_args = ", ".join(f"{name}_knl" for name in t_unit.callables_table)
     merge_stmt = f"{var_name} = lp.merge([{knl_args}])"
 
-    preamble_str = "\n".join(["import loopy as lp", "import numpy as np",
-                              "from pymbolic.primitives import *"])
+    preamble_str = "\n".join([
+        "import loopy as lp",
+        "import numpy as np",
+        "from pymbolic.primitives import *",
+        "import immutables",
+        ])
     body_str = "\n".join(knl_python_code_srcs + ["\n", merge_stmt])
 
     python_code = "\n".join([preamble_str, "\n", body_str])
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a137e82c9dc34c5007768b0f6f3d0772c39c15b8..39b706e8313c5d1362c32245c725aedd77999b8b 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -20,14 +20,24 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from loopy.diagnostic import LoopyError
-from islpy import dim_type
+from warnings import warn
+
+from dataclasses import dataclass, replace
+
+from typing import Optional, Tuple, Dict
 
-from loopy.kernel.data import ImageArg
+import numpy as np
+
+from islpy import dim_type
 
 from pytools import MovedFunctionDeprecationWrapper
-from loopy.translation_unit import (TranslationUnit,
-                                    for_each_kernel)
+
+from loopy.diagnostic import LoopyError
+from loopy.kernel.data import ImageArg, auto, TemporaryVariable
+
+from loopy.types import LoopyType
+from loopy.typing import ExpressionT
+from loopy.translation_unit import TranslationUnit, for_each_kernel
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 
@@ -578,7 +588,12 @@ def alias_temporaries(kernel, names, base_name_prefix=None,
     if base_name_prefix is None:
         base_name_prefix = "temp_storage"
 
-    vng = kernel.get_var_name_generator()
+    from pytools import UniqueNameGenerator
+    vng = UniqueNameGenerator(
+            kernel.all_variable_names()
+            | {tv.base_storage
+                for tv in kernel.temporary_variables.values()
+                if tv.base_storage is not None})
     base_name = vng(base_name_prefix)
 
     names_set = set(names)
@@ -623,7 +638,9 @@ def alias_temporaries(kernel, names, base_name_prefix=None,
                         .format(tv=tv.name))
 
             new_temporary_variables[tv.name] = \
-                    tv.copy(base_storage=base_name)
+                    tv.copy(
+                            base_storage=base_name,
+                            _base_storage_access_may_be_aliasing=False)
         else:
             new_temporary_variables[tv.name] = tv
 
@@ -952,4 +969,126 @@ def add_padding_to_avoid_bank_conflicts(kernel, device):
 # }}}
 
 
+# {{{ allocate_temporaries_for_base_storage
+
+@dataclass(frozen=True)
+class _BaseStorageInfo:
+    name: str
+    next_offset: ExpressionT
+    approx_nbytes: Optional[int] = None
+
+
+def _sym_max(a: ExpressionT, b: ExpressionT) -> ExpressionT:
+    from numbers import Number
+    if isinstance(a, Number) and isinstance(b, Number):
+        # https://github.com/python/mypy/issues/3186
+        return max(a, b)  # type: ignore[call-overload]
+    else:
+        from pymbolic.primitives import Max
+        return Max((a, b))
+
+
+@for_each_kernel
+def allocate_temporaries_for_base_storage(kernel: LoopKernel,
+        only_address_space: Optional[int] = None,
+        aliased: bool = True,
+        max_nbytes: Optional[int] = None,
+        _implicitly_run: bool = False,
+        ) -> LoopKernel:
+    from pytools import product
+
+    new_tvs = dict(kernel.temporary_variables)
+    made_changes = False
+
+    vng = kernel.get_var_name_generator()
+
+    name_aspace_dtype_to_bsi: Dict[Tuple[str, int, LoopyType], _BaseStorageInfo] = {}
+
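+    # Visit temporaries in name order so that generated pool names and
+    # offsets are deterministic.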
+    for tv in sorted(
+            kernel.temporary_variables.values(),
+            key=lambda key_tv: key_tv.name):
+        if tv.base_storage and tv.initializer:
+            raise LoopyError(
+                    f"Temporary '{tv.name}' has both base_storage "
+                    "and an initializer. That's not allowed.")
+        if tv.offset and not tv.base_storage:
+            raise LoopyError(
+                    f"Temporary '{tv.name}' has an offset and no base_storage. "
+                    "That's not allowed.")
+
+        if (tv.base_storage
+                and tv.base_storage not in kernel.temporary_variables
+                and (
+                    only_address_space is None
+                    or tv.address_space == only_address_space)):
+            made_changes = True
+
+            if tv.address_space is auto:
+                raise LoopyError("When allocating base storage for temporary "
+                        f"'{tv.name}', the address space of the temporary "
+                        "was not yet determined (set to 'auto').")
+
+            assert isinstance(tv.shape, tuple)
+            ary_size = product(si for si in tv.shape)
+            if isinstance(ary_size, (int, np.integer)):
+                approx_array_nbytes = tv.dtype.numpy_dtype.itemsize * ary_size
+            else:
+                # FIXME: Could use approximate values of ValueArgs
+                approx_array_nbytes = 0
+
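+            # Base storage is pooled per (requested name, address space,
+            # dtype), so that differently-typed temporaries never share a
+            # slab.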
+            bs_key = (tv.base_storage, tv.address_space, tv.dtype)
+            bsi = name_aspace_dtype_to_bsi.get(bs_key)
+
+            if bsi is None or (
+                    # are we out of space?
+                    not aliased
+                    and max_nbytes is not None
+                    and bsi.approx_nbytes is not None
+                    and bsi.approx_nbytes + approx_array_nbytes > max_nbytes):
+                bsi = name_aspace_dtype_to_bsi[bs_key] = _BaseStorageInfo(
+                        name=vng(tv.base_storage),
+                        next_offset=0,
+                        approx_nbytes=None if aliased else 0)
+
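+                # Start the pooled storage at size zero; its shape is grown
+                # to the required extent further down.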
+                new_tvs[bsi.name] = TemporaryVariable(
+                    name=bsi.name,
+                    dtype=tv.dtype,
+                    shape=(0,),
+                    address_space=tv.address_space)
+
+            new_tvs[tv.name] = tv.copy(
+                base_storage=bsi.name,
+                offset=bsi.next_offset,
+                _base_storage_access_may_be_aliasing=(
+                    aliased if tv._base_storage_access_may_be_aliasing is None
+                    else tv._base_storage_access_may_be_aliasing))
+
+            bs_tv = new_tvs[bsi.name]
+            assert isinstance(bs_tv.shape, tuple)
+            bs_size, = bs_tv.shape
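+            # Aliased members all live at offset 0, so the pool only needs
+            # to be as large as its largest member; non-aliased members are
+            # packed back to back.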
+            if aliased:
+                new_bs_size = _sym_max(bs_size, ary_size)
+            else:
+                new_bs_size = bs_size + ary_size
+
+                assert bsi.approx_nbytes is not None
+                name_aspace_dtype_to_bsi[bs_key] = replace(bsi,
+                    next_offset=bsi.next_offset + ary_size,
+                    approx_nbytes=bsi.approx_nbytes + approx_array_nbytes)
+
+            new_tvs[bsi.name] = new_tvs[bsi.name].copy(shape=(new_bs_size,))
+
+    if made_changes:
+        if _implicitly_run:
+            warn("Base storage allocation was performed implicitly during "
+                    "preprocessing. This is deprecated and will stop working "
+                    "in 2023. Call loopy.allocate_temporaries_for_base_storage "
+                    "explicitly to avoid this warning.", DeprecationWarning)
+
+        return kernel.copy(temporary_variables=new_tvs)
+    else:
+        return kernel
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 6e28d9e7b969372a714af78a3b772f0052347e39..2fd39cfc257f2d0a52cf9b70ec89af6200c96c81 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -23,6 +23,7 @@ THE SOFTWARE.
 
 import islpy as isl
 from islpy import dim_type
+from immutables import Map
 
 from loopy.diagnostic import LoopyError
 from pymbolic import var
@@ -103,7 +104,7 @@ def _ordered_merge_lists(list_a, list_b):
 
 
 def _merge_dicts(item_name, dict_a, dict_b):
-    result = dict_a.copy()
+    result = dict(dict_a)
 
     for k, v in dict_b.items():
         if k in result:
@@ -114,7 +115,10 @@ def _merge_dicts(item_name, dict_a, dict_b):
         else:
             result[k] = v
 
-    return result
+    if isinstance(dict_a, Map):
+        return Map(result)
+    else:
+        return result
 
 
 def _merge_values(item_name, val_a, val_b):
@@ -242,8 +246,6 @@ def _fuse_two_kernels(kernela, kernelb):
             preamble_generators=_ordered_merge_lists(
                 kernela.preamble_generators, kernelb.preamble_generators),
             assumptions=new_assumptions,
-            local_sizes=_merge_dicts(
-                "local size", kernela.local_sizes, kernelb.local_sizes),
             temporary_variables=new_temporaries,
             inames=_merge_dicts(
                 "inames",
@@ -276,7 +278,8 @@ def _fuse_two_kernels(kernela, kernelb):
                 "target",
                 kernela.target,
                 kernelb.target),
-            options=kernela.options), old_b_id_to_new_b_id
+            options=kernela.options,
+            tags=kernela.tags | kernelb.tags), old_b_id_to_new_b_id
 
 # }}}
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 9c40b6faf7a5efeecd197848793d7451140eb070..b3ccc647e8ec1dd380174d2a36a02ef2f2498163 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -245,7 +245,7 @@ def _split_iname_backend(kernel, iname_to_split,
         raise ValueError(
                 f"cannot split loop for unknown variable '{iname_to_split}'")
 
-    applied_iname_rewrites = kernel.applied_iname_rewrites[:]
+    applied_iname_rewrites = list(kernel.applied_iname_rewrites)
 
     vng = kernel.get_var_name_generator()
 
@@ -284,8 +284,7 @@ def _split_iname_backend(kernel, iname_to_split,
 
     # }}}
 
-    iname_slab_increments = kernel.iname_slab_increments.copy()
-    iname_slab_increments[outer_iname] = slabs
+    iname_slab_increments = kernel.iname_slab_increments.set(outer_iname, slabs)
 
     new_priorities = []
     for prio in kernel.loop_priority:
@@ -626,7 +625,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
             .copy(
                 instructions=new_insns,
                 domains=domch.get_domains_with(new_domain),
-                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
+                applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,)
                 ))
 
     from loopy.match import parse_stack_match
@@ -1396,7 +1395,7 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
             rule_mapping_context.finish_kernel(
                 old_to_new.map_kernel(kernel))
             .copy(
-                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
+                applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,)
                 ))
 
     # }}}
@@ -1987,7 +1986,7 @@ def map_domain(kernel, transform_map):
 
     substitutions = {}
     var_substitutions = {}
-    applied_iname_rewrites = kernel.applied_iname_rewrites[:]
+    applied_iname_rewrites = kernel.applied_iname_rewrites
 
     from loopy.symbolic import aff_to_expr
     from pymbolic import var
@@ -1997,7 +1996,7 @@ def map_domain(kernel, transform_map):
         substitutions[iname] = subst_from_map
         var_substitutions[var(iname)] = subst_from_map
 
-    applied_iname_rewrites.append(var_substitutions)
+    applied_iname_rewrites = applied_iname_rewrites + (var_substitutions,)
     del var_substitutions
 
     # }}}
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index 2f8e3abe80dfdb6b098835ef0d95b5034e52450f..58f596dc6f0af85bce8406074e7837618613cc71 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -725,6 +725,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
                     TemporaryVariable(
                         name=new_assignee_name,
                         dtype=None,
+                        shape=(),
                         address_space=AddressSpace.PRIVATE))
 
             from pymbolic import var
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 3ced06bfc9fb0afb660b9a37a571a10fc203387a..6bf1c1543d47c2db4805c1403a617ebe9c255a85 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -20,12 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from functools import cached_property
+
+from immutables import Map
 
 from loopy.diagnostic import LoopyError
 import loopy as lp
 
 from loopy.kernel.data import auto, AddressSpace
 from pytools import memoize_method, Record
+from loopy.kernel.data import Iname
 from loopy.schedule import (
             EnterLoop, LeaveLoop, RunInstruction,
             CallKernel, ReturnFromKernel, Barrier)
@@ -242,8 +246,7 @@ class TemporarySaver:
         from collections import defaultdict
         self.insns_to_insert = []
         self.insns_to_update = {}
-        self.extra_args_to_add = {}
-        self.updated_iname_to_tags = {}
+        self.updated_iname_objs = Map()
         self.updated_temporary_variables = {}
 
         # temporary name -> save or reload insn ids
@@ -295,8 +298,7 @@ class TemporarySaver:
 
         return frozenset(accessing_insns_in_subkernel)
 
-    @property
-    @memoize_method
+    @cached_property
     def base_storage_to_temporary_map(self):
         from collections import defaultdict
 
@@ -309,8 +311,7 @@ class TemporarySaver:
 
         return result
 
-    @property
-    @memoize_method
+    @cached_property
     def subkernel_to_slice_indices(self):
         result = {}
 
@@ -322,8 +323,7 @@ class TemporarySaver:
 
         return result
 
-    @property
-    @memoize_method
+    @cached_property
     def subkernel_to_surrounding_inames(self):
         current_outer_inames = set()
         within_subkernel = False
@@ -502,7 +502,7 @@ class TemporarySaver:
         if promoted_temporary is None:
             return
 
-        new_subdomain, hw_inames, dim_inames, iname_to_tags = (
+        new_subdomain, hw_inames, dim_inames, iname_objs = (
             self.augment_domain_for_save_or_reload(
                 self.new_subdomain, promoted_temporary, mode, subkernel))
 
@@ -580,7 +580,7 @@ class TemporarySaver:
         self.updated_temporary_variables[promoted_temporary.name] = (
             promoted_temporary.as_kernel_temporary(self.kernel))
 
-        self.updated_iname_to_tags.update(iname_to_tags)
+        self.updated_iname_objs = self.updated_iname_objs.update(iname_objs)
 
     @memoize_method
     def finish(self):
@@ -596,7 +596,7 @@ class TemporarySaver:
         new_instructions.extend(
             sorted(insns_to_insert.values(), key=lambda insn: insn.id))
 
-        self.updated_iname_to_tags.update(self.kernel.iname_to_tags)
+        self.updated_iname_objs = self.updated_iname_objs.update(self.kernel.inames)
         self.updated_temporary_variables.update(self.kernel.temporary_variables)
 
         new_domains = list(self.kernel.domains)
@@ -607,7 +607,7 @@ class TemporarySaver:
         kernel = self.kernel.copy(
             domains=new_domains,
             instructions=new_instructions,
-            iname_to_tags=self.updated_iname_to_tags,
+            inames=self.updated_iname_objs,
             temporary_variables=self.updated_temporary_variables,
             overridden_get_grid_sizes_for_insn_ids=None)
 
@@ -649,7 +649,7 @@ class TemporarySaver:
         orig_dim = domain.dim(isl.dim_type.set)
 
         # Tags for newly added inames
-        iname_to_tags = {}
+        iname_objs = {}
 
         from loopy.symbolic import aff_from_expr
 
@@ -674,7 +674,8 @@ class TemporarySaver:
                 # If the temporary has local scope, then loads / stores can
                 # be done in parallel.
                 from loopy.kernel.data import AutoFitLocalInameTag
-                iname_to_tags[new_iname] = frozenset([AutoFitLocalInameTag()])
+                iname_objs[new_iname] = Iname(
+                        new_iname, tags=frozenset([AutoFitLocalInameTag()]))
 
             dim_inames.append(new_iname)
 
@@ -704,7 +705,8 @@ class TemporarySaver:
                 &
                 aff[new_iname].lt_set(aff_from_expr(domain.space, dim)))
 
-            self.updated_iname_to_tags[new_iname] = frozenset([hw_tag])
+            self.updated_iname_objs = self.updated_iname_objs.set(new_iname,
+                    Iname(name=new_iname, tags=frozenset([hw_tag])))
             hw_inames.append(new_iname)
 
         # The operations on the domain above return a Set object, but the
@@ -712,7 +714,7 @@ class TemporarySaver:
         domain_list = domain.get_basic_set_list()
         assert domain_list.n_basic_set() == 1
         domain = domain_list.get_basic_set(0)
-        return domain, hw_inames, dim_inames, iname_to_tags
+        return domain, hw_inames, dim_inames, iname_objs
 
 # }}}
 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 8568f350875d428bf486f693cc8ddfba97bb9f14..13c99de1d88281499a19acf953d31f7a61e494f7 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -337,7 +337,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
         def_id = set()
         for dep_id in insn.depends_on:
             dep_insn = id_to_insn[dep_id]
-            if lhs_name in dep_insn.write_dependency_names():
+            if lhs_name in dep_insn.assignee_var_names():
                 if lhs_name in dep_insn.read_dependency_names():
                     raise LoopyError("instruction '%s' both reads *and* "
                             "writes '%s'--cannot transcribe to substitution "
@@ -378,7 +378,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
 
     definition_insn_ids = set()
     for insn in kernel.instructions:
-        if lhs_name in insn.write_dependency_names():
+        if lhs_name in insn.assignee_var_names():
             definition_insn_ids.add(insn.id)
 
     # }}}
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 96e436966385eb64c501f007cf74fbc1f60abc8d..8e9ad5bd5af0b9cf08b2e05c53f97c380d532f7a 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -855,6 +855,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx):
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
     old_calls_to_new_calls = {}
+    touched_variable_names = set()
 
     for var_chain in sccs:
         changed_during_last_queue_run = False
@@ -896,6 +897,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx):
                 if new_dtype != item.dtype:
                     debug("     changed from: %s", item.dtype)
                     changed_during_last_queue_run = True
+                    touched_variable_names.add(name)
 
                     if isinstance(item, TemporaryVariable):
                         new_temp_vars[name] = item.copy(dtype=new_dtype)
@@ -1001,6 +1003,18 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx):
     logger.debug("type inference took {dur:.2f} seconds".format(
             dur=end_time - start_time))
 
+    if kernel._separation_info():
+        sep_names = set(kernel._separation_info()) | {
+                name
+                for sep_info in kernel._separation_info().values()
+                for name in sep_info.subarray_names.values()}
+
+        touched_sep_names = sep_names & touched_variable_names
+        if touched_sep_names:
+            raise LoopyError("Type inference must not touch variables subject to "
+                    "separation after separation has been performed. "
+                    "Untyped separation-related variables: "
+                    f"{', '.join(touched_sep_names)}")
+
     pre_type_specialized_knl = unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
diff --git a/loopy/types.py b/loopy/types.py
index 14bf89fa197bf1aacbf1488ca53b8cfeebe9a174..57b9548bbaba3e36ec0af9ec5739ce93633a2992 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -20,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-from typing import Any
+from typing import Any, Mapping
 from warnings import warn
 import numpy as np
 
@@ -247,11 +249,11 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
                 "found '%s' instead" % type(dtype))
 
 
-_TO_UNSIGNED_MAPPING = {
-        np.int8: np.uint8,
-        np.int16: np.uint16,
-        np.int32: np.uint32,
-        np.int64: np.uint64,
+_TO_UNSIGNED_MAPPING: Mapping[np.dtype[Any], np.dtype[Any]] = {
+        np.dtype(np.int8): np.dtype(np.uint8),
+        np.dtype(np.int16): np.dtype(np.uint16),
+        np.dtype(np.int32): np.dtype(np.uint32),
+        np.dtype(np.int64): np.dtype(np.uint64),
         }
 
 
@@ -261,7 +263,7 @@ def to_unsigned_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]":
     if dtype.kind != "i":
         raise ValueError("can only convert integer types to unsigned")
 
-    return _TO_UNSIGNED_MAPPING[dtype.type]
+    return _TO_UNSIGNED_MAPPING[dtype]
 
 
 # vim: foldmethod=marker
diff --git a/loopy/typing.py b/loopy/typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6714d87048fb0419f14641993e4929da709b6fc
--- /dev/null
+++ b/loopy/typing.py
@@ -0,0 +1,38 @@
+__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from typing import Union, Tuple
+
+import numpy as np
+
+from pymbolic.primitives import Expression
+
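+# Type aliases for scalars and symbolic expressions as they occur in shapes
+# and strides.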
+IntegralT = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8,
+                  np.uint16, np.uint32, np.uint64]
+FloatT = Union[float, complex, np.float32, np.float64, np.complex64,
+        np.complex128]
+
+
+ExpressionT = Union[IntegralT, FloatT, Expression]
+ShapeType = Tuple[ExpressionT, ...]
+StridesType = ShapeType
diff --git a/run-mypy.sh b/run-mypy.sh
new file mode 100755
index 0000000000000000000000000000000000000000..46fe61defd71d26e046b4b61d2f0a8a2c91bc532
--- /dev/null
+++ b/run-mypy.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python -m mypy --show-error-codes loopy # examples test
diff --git a/setup.cfg b/setup.cfg
index 0a2df03be2613b4945fdfbb42b63f09522c94fef..3b0411bfc0704c78515a67eb2af7f5433a8a11db 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,3 +13,48 @@ multiline-quotes = """
 
 [tool:pytest]
 doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS
+
+[mypy]
+python_version = 3.8
+warn_unused_ignores = True
+
+exclude = (?x)(
+        loopy/target/c/compyte/ndarray/.*
+        | loopy/target/c/compyte/array\.py
+        )
+
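+# The following dependencies ship no type information; silence missing-import
+# errors for them.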
+[mypy-islpy.*]
+ignore_missing_imports = True
+
+[mypy-pymbolic.*]
+ignore_missing_imports = True
+
+[mypy-cgen.*]
+ignore_missing_imports = True
+
+[mypy-genpy.*]
+ignore_missing_imports = True
+
+[mypy-pyopencl.*]
+ignore_missing_imports = True
+
+[mypy-pygments.*]
+ignore_missing_imports = True
+
+[mypy-colorama.*]
+ignore_missing_imports = True
+
+[mypy-codepy.*]
+ignore_missing_imports = True
+
+[mypy-mako.*]
+ignore_missing_imports = True
+
+[mypy-fparser.*]
+ignore_missing_imports = True
+
+[mypy-ply.*]
+ignore_missing_imports = True
+
+[mypy-IPython.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index 011db206a4e0918030a1f75c2c094c5cc5ea4857..f265326e7534860e80ef1e91f63ea181a75de423 100644
--- a/setup.py
+++ b/setup.py
@@ -82,7 +82,7 @@ setup(name="loopy",
           "Topic :: Utilities",
           ],
 
-      python_requires="~=3.6",
+      python_requires="~=3.8",
       install_requires=[
           "pytools>=2022.1.2",
           "pymbolic>=2022.1",
diff --git a/test/test_callables.py b/test/test_callables.py
index 8ac29f39ad5f99a42094c9ff7cf44ba373e3d66e..1a1b37f2c70f635cc355d41f6d366a1f554e5b6a 100644
--- a/test/test_callables.py
+++ b/test/test_callables.py
@@ -1342,6 +1342,38 @@ def test_c_instruction_in_callee(ctx_factory, inline):
     assert out.get() == (n-1)
 
 
+def test_global_temp_var_with_base_storage(ctx_factory):
+    ctx = ctx_factory()
+    cq = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{[i, i2] : 0<=i,i2<3}",
+        """
+        a[i] = 5
+        b[i] = a[i] + 1
+        ... gbarrier
+        c[i2] = b[i2] + 2
+        d[i2] = c[i2] + 3
+        """, [
+            lp.TemporaryVariable("a", dtype=np.int32, shape=(3,),
+                address_space=lp.AddressSpace.GLOBAL, base_storage="bs"),
+            lp.TemporaryVariable("b", dtype=np.int32, shape=(3,),
+                address_space=lp.AddressSpace.GLOBAL, base_storage="bs"),
+            lp.TemporaryVariable("c", dtype=np.int32, shape=(3,),
+                address_space=lp.AddressSpace.GLOBAL, base_storage="bs"),
+            ...
+        ],
+        seq_dependencies=True)
+
+    knl = lp.allocate_temporaries_for_base_storage(knl, aliased=False)
+
+    cl_prg = cl.Program(ctx, lp.generate_code_v2(knl).device_code()).build()
+    assert [cl_knl.num_args for cl_knl in cl_prg.all_kernels()] == [1, 2]
+
+    _evt, (d,) = knl(cq)
+    assert (d.get() == 5 + 1 + 2 + 3).all()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_expression.py b/test/test_expression.py
index 9b4e1fbc5deda2f1313b816564061a78e43de8f3..451594dc0192832ee7d66d30d9fce0e914026a29 100644
--- a/test/test_expression.py
+++ b/test/test_expression.py
@@ -482,7 +482,8 @@ def test_divide_precedence(ctx_factory):
             x[0] = c*(a/b)
             y[0] = c*(a%b)
             """,
-            [lp.ValueArg("a, b, c", np.int32), lp.GlobalArg("x, y", np.int32)])
+            [lp.ValueArg("a, b, c", np.int32),
+                lp.GlobalArg("x, y", np.int32, shape=lp.auto)])
     print(lp.generate_code_v2(knl).device_code())
 
     evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 4b1146667c0d889c95ec956d59e8609f69223644..80a59981dda41c1005038923b578a74dac901fb9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -24,6 +24,7 @@ import sys
 import numpy as np
 import loopy as lp
 import pyopencl as cl
+import pyopencl.array  # noqa
 import pyopencl.clmath  # noqa
 import pyopencl.clrandom  # noqa
 import pytest
@@ -203,23 +204,6 @@ def test_owed_barriers():
     print(lp.generate_code_v2(knl))
 
 
-def test_wg_too_small():
-    knl = lp.make_kernel(
-            "{[i]: 0<=i<100}",
-            [
-                "<float32> z[i] = a[i] {id=copy}"
-                ],
-            [lp.GlobalArg("a", np.float32, shape=(100,))],
-            target=lp.PyOpenCLTarget(),
-            local_sizes={0: 16})
-
-    knl = lp.tag_inames(knl, dict(i="l.0"))
-
-    print(knl)
-    with pytest.raises(RuntimeError):
-        print(lp.generate_code_v2(knl))
-
-
 def test_multi_cse():
     knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
@@ -227,8 +211,7 @@ def test_multi_cse():
                 "<float32> z[i] = a[i] + a[i]**2"
                 ],
             [lp.GlobalArg("a", np.float32, shape=(100,))],
-            target=lp.PyOpenCLTarget(),
-            local_sizes={0: 16})
+            target=lp.PyOpenCLTarget())
 
     knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
@@ -766,6 +749,23 @@ def test_make_copy_kernel(ctx_factory):
     assert (a1 == a3).all()
 
 
+def test_make_copy_kernel_with_offsets(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    a1 = np.random.randn(3, 1024, 4)
+    a1_dev = cl.array.to_device(queue, a1)
+
+    cknl1 = lp.make_copy_kernel("c,c,c", "sep,c,c")
+
+    cknl1 = lp.fix_parameters(cknl1, n0=3)
+
+    cknl1 = lp.set_options(cknl1, write_code=True)
+    evt, (a2_dev,) = cknl1(queue, input=a1_dev)
+
+    assert (a1 == a2_dev.get()).all()
+
+
 def test_auto_test_can_detect_problems(ctx_factory):
     ctx = ctx_factory()
 
@@ -1400,7 +1400,7 @@ def test_global_temporary(ctx_factory):
 
     assert len(cgr.device_programs) == 2
 
-    #print(cgr.device_code())
+    print(cgr.device_code())
     #print(cgr.host_code())
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
@@ -1841,7 +1841,7 @@ def test_header_extract():
     cuknl = knl.copy(target=lp.CudaTarget())
     assert str(lp.generate_header(cuknl)[0]) == (
             'extern "C" __global__ void __launch_bounds__(1) '
-            "loopy_kernel(float *__restrict__ T);")
+            "loopy_kernel(__global__ float *__restrict__ T);")
 
     #test OpenCL
     oclknl = knl.copy(target=lp.PyOpenCLTarget())
@@ -1859,8 +1859,12 @@ def test_scalars_with_base_storage(ctx_factory):
     knl = lp.make_kernel(
             [isl.BasicSet("[] -> {[]: }")],  # empty (domain w/unused inames errors)
             "a = 1",
-            [lp.TemporaryVariable("a", dtype=np.float64,
-                                  shape=(), base_storage="base")])
+            [
+                lp.TemporaryVariable("a", dtype=np.float64,
+                                  shape=(), base_storage="base"),
+                lp.TemporaryVariable("b", dtype=np.float64,
+                                  shape=(), base_storage="base"),
+                ])
 
     knl(queue, out_host=True)
 
@@ -2392,20 +2396,6 @@ def test_inames_conditional_generation(ctx_factory):
         knl(queue)
 
 
-def test_kernel_var_name_generator():
-    prog = lp.make_kernel(
-            "{[i]: 0 <= i <= 10}",
-            """
-            <>a = 0
-            <>b_s0 = 0
-            """)
-
-    vng = prog["loopy_kernel"].get_var_name_generator()
-
-    assert vng("a_s0") != "a_s0"
-    assert vng("b") != "b"
-
-
 def test_fixed_parameters(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
diff --git a/test/test_misc.py b/test/test_misc.py
index 0e8a528ecc0784e4280ca8e4e0d3f417b3db145c..7feb5ae73b5618b0c3d7219db1252fe9df1f7daa 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from pickle import loads, dumps
+
 import pytest
 import loopy as lp
 
@@ -32,6 +34,16 @@ logger = logging.getLogger(__name__)
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
+def test_kernel_pickling_and_hashing():
+    knl = lp.make_kernel("{[i]: 0<=i<10}",
+                         """
+                         y[i] = i
+                         """)
+    from loopy.tools import LoopyKeyBuilder
+    reconst_knl = loads(dumps(knl))
+    assert LoopyKeyBuilder()(knl) == LoopyKeyBuilder()(reconst_knl)
+
+
 def test_SetTrie():
     from loopy.kernel.tools import SetTrie
 
@@ -79,8 +91,6 @@ def test_LazilyUnpicklingDict():
 
     assert not cls.instance_unpickled
 
-    from pickle import loads, dumps
-
     pickled_mapping = dumps(mapping)
 
     # {{{ test lazy loading
@@ -133,7 +143,6 @@ def test_LazilyUnpicklingList():
     lst = LazilyUnpicklingList([cls()])
     assert not cls.instance_unpickled
 
-    from pickle import loads, dumps
     pickled_lst = dumps(lst)
 
     # {{{ test lazy loading
@@ -188,7 +197,6 @@ def test_LazilyUnpicklingListWithEqAndPersistentHashing():
     from loopy.tools import LazilyUnpicklingListWithEqAndPersistentHashing
 
     cls = PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing
-    from pickle import loads, dumps
 
     # {{{ test comparison of a pair of lazy lists
 
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 3d54b231145db0fed43113a2b22d265d77148f36..6d97104a9542340db8c2c227105ef7175e55547c 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -51,6 +51,11 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     pytest.importorskip("fparser")
     ctx = ctx_factory()
 
+    if (ctx.devices[0].platform.name == "Portable Computing Language"
+            and ilp_multiple > 1):
+        # about 400s, cf. https://gitlab.tiker.net/inducer/loopy/-/jobs/421250#L937
+        pytest.skip("takes a very long time to compile on pocl")
+
     filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90")
     with open(filename) as sourcef:
         source = sourcef.read()
diff --git a/test/test_target.py b/test/test_target.py
index 6a0c26fd9b9ce769366e112441b7549fce150b8c..b6acd0902e75b7480a6891b9320df5248cce882b 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -54,7 +54,7 @@ __all__ = [
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-def test_ispc_target(occa_mode=False):
+def test_ispc_target():
     from loopy.target.ispc import ISPCTarget
 
     knl = lp.make_kernel(
@@ -64,7 +64,7 @@ def test_ispc_target(occa_mode=False):
                 lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                 "..."
                 ],
-            target=ISPCTarget(occa_mode=occa_mode))
+            target=ISPCTarget())
 
     knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
@@ -254,39 +254,6 @@ def test_clamp(ctx_factory):
     evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
 
 
-def test_numba_target():
-    knl = lp.make_kernel(
-        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
-        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
-        target=lp.NumbaTarget())
-
-    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})
-
-    print(lp.generate_code_v2(knl).device_code())
-
-
-def test_numba_cuda_target():
-    knl = lp.make_kernel(
-        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
-        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
-        target=lp.NumbaCudaTarget())
-
-    knl = lp.assume(knl, "M>0")
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
-    knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1))
-    knl = lp.add_prefetch(knl, "X[i,:]",
-            fetch_outer_inames="i_inner, i_outer, j_inner",
-            default_tag="l.auto")
-    knl = lp.fix_parameters(knl, N=3)
-    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
-    knl = lp.tag_inames(knl, "k:unr")
-    knl = lp.tag_array_axes(knl, "X", "N0,N1")
-
-    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})
-
-    print(lp.generate_code_v2(knl).all_code())
-
-
 def test_sized_integer_c_codegen(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -557,7 +524,7 @@ def test_input_args_are_required(ctx_factory):
         """
         g[i] = f[i] + 1.5
         """,
-        [lp.GlobalArg("f, g", dtype="float64"), ...]
+        [lp.GlobalArg("f, g", shape=lp.auto, dtype="float64"), ...]
     )
 
     knl2 = lp.make_kernel(
@@ -582,7 +549,7 @@ def test_input_args_are_required(ctx_factory):
         f[i] = 3.
         g[i] = f[i] + 1.5
         """,
-        [lp.GlobalArg("f, g", dtype="float64"), ...]
+        [lp.GlobalArg("f, g", shape=lp.auto, dtype="float64"), ...]
     )
 
     # FIXME: this should not raise!