diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7d8101763de864e20bd92c6be0d1fef0e31d1b31..05b2e323793ee19e202f6e89425e26f5f9fb2582 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,8 +4,6 @@ on:
         branches:
         - master
     pull_request:
-        paths-ignore:
-        - 'doc/*.rst'
     schedule:
         - cron:  '17 3 * * 0'
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d69f0b8c489c07d3aa1512f6f1cbb8ced0f6a2e9..f0e9aa0e593784742a9c2587c6e037f0b111d127 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -89,6 +89,8 @@ Python 3 POCL Examples:
   - python3
   - pocl
   - large-node
+  # For examples/python/ispc-stream-harness.py
+  - avx2
   except:
   - tags
 
diff --git a/doc/conf.py b/doc/conf.py
index 942afcd3ce11056c65c6a7500bb5ed312dc40187..9b8cf81e11dbbaee53110c36b1e601a80ae0104b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,52 +1,35 @@
-#
-# loopy documentation build configuration file, created by
-# sphinx-quickstart on Tue Aug  9 13:40:49 2011.
-#
-# This file is execfile()d with the current directory set to its containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-#import sys
 import os
 
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-
 # -- General configuration -----------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+#needs_sphinx = "1.0"
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
-# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+# coming with Sphinx (named "sphinx.ext.*") or your custom ones.
 extensions = [
-        'sphinx.ext.autodoc',
-        'sphinx.ext.intersphinx',
-        #'sphinx.ext.viewcode',
-        'sphinx.ext.doctest',
+        "sphinx.ext.autodoc",
+        "sphinx.ext.intersphinx",
+        #"sphinx.ext.viewcode",
+        "sphinx.ext.doctest",
+        "sphinx_copybutton",
         ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+#source_encoding = "utf-8-sig"
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = 'loopy'
-copyright = '2016, Andreas Klöckner'
+project = "loopy"
+copyright = "2016, Andreas Klöckner"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -59,7 +42,7 @@ with open(_version_source) as vpy_file:
     version_py = vpy_file.read()
 
 os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
-exec(compile(version_py, _version_source, 'exec'), ver_dic)
+exec(compile(version_py, _version_source, "exec"), ver_dic)
 version = ".".join(str(x) for x in ver_dic["VERSION"])
 # The full version, including alpha/beta/rc tags.
 release = ver_dic["VERSION_TEXT"]
@@ -77,7 +60,7 @@ del os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all documents.
 #default_role = None
@@ -94,7 +77,7 @@ exclude_patterns = ['_build']
 #show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
 #modindex_common_prefix = []
@@ -102,135 +85,16 @@ pygments_style = 'sphinx'
 
 # -- Options for HTML output ---------------------------------------------------
 
-html_theme = "alabaster"
+html_theme = "furo"
 
 html_theme_options = {
-        "extra_nav_links": {
-            "🚀 Github": "https://github.com/inducer/loopy",
-            "💾 Download Releases": "https://pypi.org/project/loopy",
-            }
         }
 
 html_sidebars = {
-    '**': [
-        'about.html',
-        'navigation.html',
-        'relations.html',
-        'searchbox.html',
-    ]
-}
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_domain_indices = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+        }
 
 # If true, links to the reST sources are added to the pages.
-html_show_sourcelink = False
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'loopydoc'
-
-
-# -- Options for LaTeX output --------------------------------------------------
-
-# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
-
-# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-        ('index', 'loopy.tex', 'loopy Documentation',
-            'Andreas Kloeckner', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# If true, show page references after internal links.
-#latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-#latex_show_urls = False
-
-# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_domain_indices = True
+html_show_sourcelink = True
 
 
 # -- Options for manual page output --------------------------------------------
@@ -238,20 +102,21 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'loopy', 'loopy Documentation',
-     ['Andreas Kloeckner'], 1)
+    ("index", "loopy", "loopy Documentation",
+     ["Andreas Kloeckner"], 1)
 ]
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'https://docs.python.org/3': None,
-    'https://documen.tician.de/islpy': None,
-    'https://documen.tician.de/pyopencl': None,
-    'https://documen.tician.de/cgen': None,
-    'https://docs.scipy.org/doc/numpy/': None,
-    'https://documen.tician.de/pymbolic': None,
-    'https://documen.tician.de/pytools': None,
+    "https://docs.python.org/3": None,
+    "https://numpy.org/doc/stable/": None,
+    "https://documen.tician.de/islpy": None,
+    "https://documen.tician.de/pyopencl": None,
+    "https://documen.tician.de/cgen": None,
+    "https://documen.tician.de/pymbolic": None,
+    "https://documen.tician.de/pytools": None,
     }
 
 autoclass_content = "class"
+autodoc_typehints = "description"
diff --git a/doc/index.rst b/doc/index.rst
index 8eb996f6b48b4b2526b2114c10fbe94669f87b44..7baff3249a25e69019c06802901538500c1af971 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -46,6 +46,8 @@ Please check :ref:`installation` to get started.
     ref_other
     misc
     ref_internals
+    🚀 Github <https://github.com/inducer/loopy>
+    💾 Download Releases <https://pypi.org/project/loopy>
 
 Indices and tables
 ==================
diff --git a/doc/misc.rst b/doc/misc.rst
index 4c8c9867f3ceee2447f9249097c7c30f4d6f501d..e8bcefc65ea5519eafb3ad8b1ec34774b64314ed 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -49,21 +49,18 @@ MacOS support computers:
     Everywhere else, just making sure you have the ``g++`` package should be
     enough.
 
-#.  Install `miniconda <https://conda.io/miniconda.html>`_.
-    (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.)
+#.  Install `miniforge <https://github.com/conda-forge/miniforge>`_.
 
-#.  ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3``
+#.  ``export CONDA=/WHERE/YOU/INSTALLED/miniforge3``
 
     If you accepted the default location, this should work:
 
-    ``export CONDA=$HOME/miniconda3``
+    ``export CONDA=$HOME/miniforge3``
 
 #.  ``$CONDA/bin/conda create -n dev``
 
 #.  ``source $CONDA/bin/activate dev``
 
-#.  ``conda config --add channels conda-forge``
-
 #.  ``conda install git pip pocl islpy pyopencl`` (Linux)
 
     or
@@ -76,7 +73,7 @@ MacOS support computers:
 
 Next time you want to use :mod:`loopy`, just run the following command::
 
-    source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev
+    source /WHERE/YOU/INSTALLED/miniforge3/bin/activate dev
 
 You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
 
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index efe147493df36df2f7afa4bac4241b88bb5ce598..2b496c77deeaa58be05ce13021a42dd78d2f9ded 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -3,6 +3,72 @@
 Reference: Loopy's Model of a Kernel
 ====================================
 
+What Types of Computation can a Loopy Program Express?
+------------------------------------------------------
+
+Loopy programs consist of an a-priori unordered set of statements, operating
+on :math:`n`-dimensional array variables.
+
+Arrays consist of "plain old data" and structures thereof, as describable
+by a :class:`numpy.dtype`.  The n-dimensional shape of these arrays is
+given by a tuple of expressions at most affine in parameters that are
+fixed for the duration of program execution.
+Each array variable in the program is either an argument or a temporary
+variable.  A temporary variable is only live within the program, while
+argument variables are accessible outside the program and constitute the
+program's inputs and outputs.
+
+A statement (still called 'instruction' in some places, cf.
+:class:`loopy.InstructionBase`) encodes an assignment to an entry of an array.
+The right-hand side of an assignment consists of an expression that may
+consist of arithmetic operations and calls to functions.
+If the outermost operation of the RHS expression is a function call,
+the RHS value may be a tuple, and multiple (still scalar) arrays appear
+as LHS values. (This is the only sense in which tuple types are supported.)
+Each statement is parametrized by zero or more loop variables ("inames").
+A statement is executed once for each integer point defined by the domain
+forest for the iname tuple given for that statement
+(:attr:`loopy.InstructionBase.within_inames`). Each execution of a
+statement (with specific values of the inames) is called a *statement
+instance*.  Dependencies between these instances as well as instances of
+other statements are encoded in the program representation and specify permissible
+execution orderings.  (The semantics of the dependencies are `being
+sharpened <https://github.com/inducer/loopy/pull/168>`__.) Assignments
+(comprising the evaluation of the RHS and the assignment to the LHS) may
+be specified to be atomic.
+
+The basic building blocks of the domain forest are sets given as
+conjunctions of equalities and inequalities of quasi-affine expressions on
+integer tuples, called domains, and represented as instances of
+:class:`islpy.BasicSet`. The entries of each integer tuple are
+either *parameters* or *inames*. Each domain may optionally have a *parent
+domain*. Parameters of parent-less domains are given by value arguments
+supplied to the program that will remain unchanged during program
+execution. Parameters of domains with parents may be
+
+- run-time-constant value arguments to the program, or
+- inames from parent domains, or
+- scalar, integer temporary variables that are written by statements
+  with iteration domains controlled by a parent domain.
+
+For each tuple of concrete parameter values, the set of iname tuples must be
+finite. Each iname is defined by exactly one domain.
+
+For a tuple of inames, the domain forest defines an iteration domain
+by finding all the domains defining the inames involved, along with their
+parent domains. The resulting tree of domains may contain multiple roots,
+but no branches. The iteration domain is then constructed by intersecting
+these domains and constructing the projection of that set onto the space
+given by the required iname tuple. Observe that, via the parent-child
+domain mechanism, imperfectly-nested and data-dependent loops become
+expressible.
+
+The set of functions callable from the language is predefined by the system.
+Additional functions may be defined by the user by registering them. It is
+not currently possible to define functions from within Loopy; however, work
+is progressing on permitting this. Even once this is allowed, recursion
+will not be permitted.
+
 .. _domain-tree:
 
 Loop Domain Forest
diff --git a/loopy/check.py b/loopy/check.py
index e66af04d2fe4dfc2e1f5a99281783feecec2bee7..0bf02f7cf7425f0a277a200a1bdc51c60347fd57 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -24,8 +24,7 @@ THE SOFTWARE.
 from islpy import dim_type
 import islpy as isl
 from loopy.symbolic import WalkMapper
-from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning,
-        warn_with_kernel, ExpressionToAffineConversionError)
+from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel
 from loopy.type_inference import TypeInferenceMapper
 from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction,
         CInstruction, _DataObliviousInstruction)
@@ -216,7 +215,7 @@ def check_for_double_use_of_hw_axes(kernel):
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
-        for iname in kernel.insn_inames(insn):
+        for iname in insn.within_inames:
             for tag in kernel.iname_tags_of_type(iname, UniqueTag):
                 key = tag.key
                 if key in insn_tag_keys:
@@ -233,12 +232,12 @@ def check_for_inactive_iname_access(kernel):
     for insn in kernel.instructions:
         expression_inames = insn.read_dependency_names() & kernel.all_inames()
 
-        if not expression_inames <= kernel.insn_inames(insn):
+        if not expression_inames <= insn.within_inames:
             raise LoopyError(
                     "instruction '%s' references "
                     "inames '%s' that the instruction does not depend on"
                     % (insn.id,
-                        ", ".join(expression_inames - kernel.insn_inames(insn))))
+                        ", ".join(expression_inames - insn.within_inames)))
 
 
 def check_for_unused_inames(kernel):
@@ -294,7 +293,7 @@ def check_for_write_races(kernel):
                 insn.assignee_var_names(),
                 insn.assignee_subscript_deps()):
             assignee_inames = assignee_indices & kernel.all_inames()
-            if not assignee_inames <= kernel.insn_inames(insn):
+            if not assignee_inames <= insn.within_inames:
                 raise LoopyError(
                         "assignee of instructions '%s' references "
                         "iname that the instruction does not depend on"
@@ -305,13 +304,13 @@ def check_for_write_races(kernel):
                 # will cause write races.
 
                 raceable_parallel_insn_inames = {
-                    iname for iname in kernel.insn_inames(insn)
+                    iname for iname in insn.within_inames
                     if kernel.iname_tags_of_type(iname, ConcurrentTag)}
 
             elif assignee_name in kernel.temporary_variables:
                 temp_var = kernel.temporary_variables[assignee_name]
                 raceable_parallel_insn_inames = {
-                        iname for iname in kernel.insn_inames(insn)
+                        iname for iname in insn.within_inames
                         if any(_is_racing_iname_tag(temp_var, tag)
                             for tag in kernel.iname_tags(iname))}
 
@@ -445,19 +444,14 @@ class _AccessCheckMapper(WalkMapper):
                         % (expr, self.insn_id, access_range, shape_domain))
 
     def map_if(self, expr, domain):
-        from loopy.symbolic import get_dependencies
-        if get_dependencies(expr.condition) <= frozenset(
-                domain.space.get_var_dict()):
-            try:
-                from loopy.symbolic import isl_set_from_expr
-                then_set = isl_set_from_expr(domain.space, expr.condition)
-                else_set = then_set.complement()
-            except ExpressionToAffineConversionError:
-                # non-affine condition: can't do much
-                then_set = else_set = isl.BasicSet.universe(domain.space)
-        else:
-            # data-dependent condition: can't do much
+        from loopy.symbolic import condition_to_set
+        then_set = condition_to_set(domain.space, expr.condition)
+        if then_set is None:
+            # condition cannot be converted to an ISL set => do not let it
+            # restrict the domain of either branch
             then_set = else_set = isl.BasicSet.universe(domain.space)
+        else:
+            else_set = then_set.complement()
 
         self.rec(expr.then, domain & then_set)
         self.rec(expr.else_, domain & else_set)
@@ -467,9 +461,10 @@ def check_bounds(kernel):
     """
     Performs out-of-bound check for every array access.
     """
+    from loopy.kernel.instruction import get_insn_domain
     temp_var_names = set(kernel.temporary_variables)
     for insn in kernel.instructions:
-        domain = kernel.get_inames_domain(kernel.insn_inames(insn))
+        domain = get_insn_domain(insn, kernel)
 
         # data-dependent bounds? can't do much
         if set(domain.get_var_names(dim_type.param)) & temp_var_names:
@@ -496,7 +491,7 @@ def check_write_destinations(kernel):
             if wvar in kernel.all_inames():
                 raise LoopyError("iname '%s' may not be written" % wvar)
 
-            insn_domain = kernel.get_inames_domain(kernel.insn_inames(insn))
+            insn_domain = kernel.get_inames_domain(insn.within_inames)
             insn_params = set(insn_domain.get_var_names(dim_type.param))
 
             if wvar in kernel.all_params():
@@ -941,7 +936,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
             group_axes_used = set()
             local_axes_used = set()
 
-            for iname in kernel.insn_inames(insn):
+            for iname in insn.within_inames:
                 ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
                 gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
                 altags = kernel.iname_tags_of_type(
@@ -1197,7 +1192,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
         assert idomains
 
-        insn_inames = kernel.insn_inames(insn)
+        insn_inames = insn.within_inames
 
         # {{{ if we've checked the same thing before, no need to check it again
 
@@ -1274,7 +1269,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
                 iname_to_dim = pt.get_space().get_var_dict()
                 point_axes = []
-                for iname in kernel.insn_inames(insn) | parameter_inames:
+                for iname in insn_inames | parameter_inames:
                     tp, dim = iname_to_dim[iname]
                     point_axes.append("%s=%d" % (
                         iname, pt.get_coordinate_val(tp, dim).to_python()))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index cbae4eac5ed796090c52c40a7fde4b6ebeed36a0..0f5d824cc752a372023cc177c780b2606593a0f7 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -146,13 +146,18 @@ class SeenFunction(ImmutableRecord):
     .. attribute:: arg_dtypes
 
         a tuple of arg dtypes
+
+    .. attribute:: result_dtypes
+
+        a tuple of result dtypes
     """
 
-    def __init__(self, name, c_name, arg_dtypes):
+    def __init__(self, name, c_name, arg_dtypes, result_dtypes):
         ImmutableRecord.__init__(self,
                 name=name,
                 c_name=c_name,
-                arg_dtypes=arg_dtypes)
+                arg_dtypes=arg_dtypes,
+                result_dtypes=result_dtypes)
 
 
 class CodeGenerationState:
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 71133ef7cf2a29be1a8673e99a81f21544f5404a..14efb64f4618c025a319564ebef3e0232800aecc 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -89,7 +89,7 @@ def generate_instruction_code(codegen_state, insn):
     else:
         raise RuntimeError("unexpected instruction type")
 
-    insn_inames = kernel.insn_inames(insn)
+    insn_inames = insn.within_inames
 
     return to_codegen_result(
             codegen_state,
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 9088f3bfe5f56c42884ab196c4cfc04d8341e3ef..b24cde2c419cc3fb549473cb620e040520a29a07 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -824,7 +824,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         result = {
                 iname: set() for iname in self.all_inames()}
         for insn in self.instructions:
-            for iname in self.insn_inames(insn):
+            for iname in insn.within_inames:
                 result[iname].add(insn.id)
 
         return result
@@ -1561,10 +1561,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         for field_name in self.hash_fields:
             key_builder.rec(key_hash, getattr(self, field_name))
 
+    @memoize_method
     def __hash__(self):
         from loopy.tools import LoopyKeyBuilder
-        from pytools.persistent_dict import new_hash
-        key_hash = new_hash()
+        import hashlib
+        key_hash = hashlib.sha256()
         self.update_persistent_hash(key_hash, LoopyKeyBuilder())
         return hash(key_hash.digest())
 
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6b0248f4f9c18001ef23b0c1551316d9cb6ad065..9fd166ab8f15bdc97006c94c7d03977b64c08292 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 import re
 
 from pytools import ImmutableRecord, memoize_method
+from pytools.tag import Taggable
 
 import numpy as np  # noqa
 
@@ -136,6 +137,12 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase):
         return self.stringify(True)
 
     def map_expr(self, mapper):
+        from loopy.kernel.data import auto
+
+        if self.stride is auto:
+            # lp.auto not an expr => do not map
+            return self
+
         return self.copy(stride=mapper(self.stride))
 
 
@@ -557,7 +564,7 @@ def _parse_shape_or_strides(x):
     return tuple(_pymbolic_parse_if_necessary(xi) for xi in x)
 
 
-class ArrayBase(ImmutableRecord):
+class ArrayBase(ImmutableRecord, Taggable):
     """
     .. attribute :: name
 
@@ -600,7 +607,8 @@ class ArrayBase(ImmutableRecord):
     .. attribute:: offset
 
         Offset from the beginning of the buffer to the point from
-            which the strides are counted. May be one of
+        which the strides are counted, in units of the :attr:`dtype`.
+        May be one of
 
             * 0 or None
             * a string (that is interpreted as an argument name).
@@ -636,6 +644,14 @@ class ArrayBase(ImmutableRecord):
 
         .. versionadded:: 2018.1
 
+    .. attribute:: tags
+
+        A (possibly empty) frozenset of instances of
+        :class:`pytools.tag.Tag` intended for
+        consumption by an application.
+
+        .. versionadded:: 2020.2.2
+
     .. automethod:: __init__
     .. automethod:: __eq__
     .. automethod:: num_user_axes
@@ -652,8 +668,7 @@ class ArrayBase(ImmutableRecord):
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
             dim_names=None, strides=None, order=None, for_atomic=False,
-            target=None, alignment=None,
-            **kwargs):
+            target=None, alignment=None, tags=None, **kwargs):
         """
         All of the following (except *name*) are optional.
         Specify either strides or shape.
@@ -691,7 +706,8 @@ class ArrayBase(ImmutableRecord):
             using atomic-capable data types.
         :arg offset: (See :attr:`offset`)
         :arg alignment: memory alignment in bytes
-
+        :arg tags: An instance of or an Iterable of instances of
+            :class:`pytools.tag.Tag`.
         """
 
         for kwarg_name in kwargs:
@@ -848,6 +864,7 @@ class ArrayBase(ImmutableRecord):
                 order=order,
                 alignment=alignment,
                 for_atomic=for_atomic,
+                tags=tags,
                 **kwargs)
 
     def __eq__(self, other):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index a22fef9e8021d55759a9f0a2c0f4f23bfe35df80..94534382f19790936152661b48d4d515e9e0e129 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1523,7 +1523,7 @@ def determine_shapes_of_temporaries(knl):
     def feed_all_expressions(receiver):
         for insn in knl.instructions:
             insn.with_transformed_expressions(
-                lambda expr: receiver(expr, knl.insn_inames(insn)))
+                lambda expr: receiver(expr, insn.within_inames))
 
     var_to_base_indices, var_to_shape, var_to_error = (
         find_shapes_of_vars(
@@ -1543,7 +1543,7 @@ def determine_shapes_of_temporaries(knl):
         def feed_assignee_of_instruction(receiver):
             for insn in knl.instructions:
                 for assignee in insn.assignees:
-                    receiver(assignee, knl.insn_inames(insn))
+                    receiver(assignee, insn.within_inames)
 
         var_to_base_indices_fallback, var_to_shape_fallback, var_to_error = (
             find_shapes_of_vars(
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 43770ffb6d0f2ae08d8967baa03fedea669343ed..6e454d925167fd6344a7d4cd30c83f28f6ac2e23 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 from sys import intern
 import numpy as np  # noqa
 from pytools import ImmutableRecord
+from pytools.tag import Taggable
 from loopy.kernel.array import ArrayBase
 from loopy.diagnostic import LoopyError
 from loopy.kernel.instruction import (  # noqa
@@ -357,7 +358,6 @@ class KernelArgument(ImmutableRecord):
                     DeprecationWarning, stacklevel=2)
 
             dtype = None
-
         kwargs["dtype"] = dtype
 
         ImmutableRecord.__init__(self, **kwargs)
@@ -379,13 +379,13 @@ class ArrayArg(ArrayBase, KernelArgument):
 
     allowed_extra_kwargs = [
             "address_space",
-            "is_output_only"]
+            "is_output_only",
+            "tags"]
 
     def __init__(self, *args, **kwargs):
         if "address_space" not in kwargs:
             raise TypeError("'address_space' must be specified")
         kwargs["is_output_only"] = kwargs.pop("is_output_only", False)
-
         super().__init__(*args, **kwargs)
 
     min_target_axes = 0
@@ -451,15 +451,29 @@ class ImageArg(ArrayBase, KernelArgument):
                 self.num_target_axes(), dtype, is_written)
 
 
-class ValueArg(KernelArgument):
+"""
+    :attribute tags: A (possibly empty) frozenset of instances of
+        :class:`pytools.tag.Tag` intended for consumption by an
+        application.
+
+        .. versionadded:: 2020.2.2
+"""
+
+
+class ValueArg(KernelArgument, Taggable):
     def __init__(self, name, dtype=None, approximately=1000, target=None,
-            is_output_only=False):
+            is_output_only=False, tags=None):
+        """
+        :arg tags: An instance of or an Iterable of instances of
+            :class:`pytools.tag.Tag` intended for consumption by an
+            application.
+        """
 
         KernelArgument.__init__(self, name=name,
                 dtype=dtype,
                 approximately=approximately,
                 target=target,
-                is_output_only=is_output_only)
+                is_output_only=is_output_only, tags=tags)
 
     def __str__(self):
         import loopy as lp
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 791ea89a6521c58cfe9281723ea8d83f83baf84a..101d16624c6698bf6f8ac45c5154b0fab4e6e9f5 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -25,6 +25,7 @@ from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.tools import Optional
 from warnings import warn
+import islpy as isl
 
 
 # {{{ instructions: base class
@@ -1438,4 +1439,49 @@ def _check_and_fix_temp_var_type(temp_var_type, stacklevel=2):
 # }}}
 
 
+def get_insn_domain(insn, kernel):
+    """
+    Returns an instance of :class:`islpy.Set` for the *insn*'s domain.
+
+    Read-only value arguments read by *insn* are added as parameters, and
+    predicates that ``condition_to_set`` can convert restrict the result.
+
+    .. note::
+
+        Does not take into account additional hints available through
+        :attr:`loopy.LoopKernel.assumptions`.
+    """
+    from loopy.kernel.data import ValueArg
+    from loopy.symbolic import condition_to_set
+    domain = kernel.get_inames_domain(insn.within_inames)
+
+    # {{{ add read-only ValueArgs to domain
+
+    written_vars = kernel.get_written_variables()  # hoisted: loop-invariant
+    valueargs_to_add = ({arg.name for arg in kernel.args
+                         if isinstance(arg, ValueArg)
+                         and arg.name not in written_vars}
+                        - set(domain.get_var_names(isl.dim_type.param)))
+
+    # only valueargs relevant to *insn*; sorted for a reproducible param order
+    for arg_to_add in sorted(valueargs_to_add & insn.read_dependency_names()):
+        idim = domain.dim(isl.dim_type.param)
+        domain = domain.add_dims(isl.dim_type.param, 1)
+        domain = domain.set_dim_name(isl.dim_type.param, idim, arg_to_add)
+
+    # }}}
+
+    # {{{ enforce restriction from predicates
+
+    insn_preds_set = isl.BasicSet.universe(domain.space)
+    for predicate in insn.predicates:
+        predicate_as_isl_set = condition_to_set(domain.space, predicate)
+        if predicate_as_isl_set is not None:
+            insn_preds_set = insn_preds_set & predicate_as_isl_set
+
+    # }}}
+
+    return domain & insn_preds_set
+
+
 # vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 0b8d9841ee77020149a1f246a301e9c422b202e6..541bb45ce52821d00e3e255ad600c392f535d303 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -685,7 +685,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     from loopy.kernel.data import AutoLocalIndexTagBase
     auto_axis_inames = {
-        iname for iname in kernel.insn_inames(insn)
+        iname for iname in insn.within_inames
         if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)}
 
     # }}}
@@ -744,7 +744,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
     if aggregate_strides:
         very_large_stride = int(np.iinfo(np.int32).max)
 
-        return sorted((iname for iname in kernel.insn_inames(insn)),
+        return sorted((iname for iname in insn.within_inames),
                 key=lambda iname: (
                     aggregate_strides.get(iname, very_large_stride),
                     iname))
@@ -885,7 +885,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             continue
 
         auto_axis_inames = [
-            iname for iname in kernel.insn_inames(insn)
+            iname for iname in insn.within_inames
             if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)]
 
         if not auto_axis_inames:
@@ -893,7 +893,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
         assigned_local_axes = set()
 
-        for iname in kernel.insn_inames(insn):
+        for iname in insn.within_inames:
             tags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
             if tags:
                 tag, = tags
@@ -1000,7 +1000,7 @@ def guess_var_shape(kernel, var_name):
     submap = SubstitutionRuleExpander(kernel.substitutions)
 
     def run_through_armap(expr):
-        armap(submap(expr), kernel.insn_inames(insn))
+        armap(submap(expr), insn.within_inames)
         return expr
 
     try:
@@ -1533,7 +1533,7 @@ def stringify_instruction_list(kernel):
             raise LoopyError("unexpected instruction type: %s"
                     % type(insn).__name__)
 
-        adapt_to_new_inames_list(kernel.insn_inames(insn))
+        adapt_to_new_inames_list(insn.within_inames)
 
         options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL]
         if insn.priority:
diff --git a/loopy/options.py b/loopy/options.py
index 2dc8f22cd8a205da89d86b5157af8792a37111ed..46ff37947b66c02e3751a815ab660d9807e86724 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -98,6 +98,12 @@ class Options(ImmutableRecord):
         Do not do any checking (data type, data layout, shape,
         etc.) on arguments for a minor performance gain.
 
+        .. versionchanged:: 2021.1
+
+            This now defaults to the same value as the ``optimize``
+            sub-flag from :data:`sys.flags`. This flag can be controlled
+            (i.e. set to *True*) by running Python with the ``-O`` flag.
+
     .. attribute:: no_numpy
 
         Do not check for or accept :mod:`numpy` arrays as
@@ -196,6 +202,7 @@ class Options(ImmutableRecord):
         allow_terminal_colors_def = (
                 ALLOW_TERMINAL_COLORS and allow_terminal_colors_def)
 
+        import sys
         ImmutableRecord.__init__(
                 self,
 
@@ -203,7 +210,7 @@ class Options(ImmutableRecord):
                 trace_assignments=kwargs.get("trace_assignments", False),
                 trace_assignment_values=kwargs.get("trace_assignment_values", False),
 
-                skip_arg_checks=kwargs.get("skip_arg_checks", False),
+                skip_arg_checks=kwargs.get("skip_arg_checks", sys.flags.optimize),
                 no_numpy=kwargs.get("no_numpy", False),
                 cl_exec_manage_array_events=kwargs.get("no_numpy", True),
                 return_dict=kwargs.get("return_dict", False),
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 12f1cb4691cf749ebd147c65582900c9ce3dce04..40b5827343ae7c4cf2fb2886d88a5324c930285a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -256,7 +256,6 @@ def find_temporary_address_space(kernel):
 
         overall_aspace = max(desired_aspace_per_insn)
 
-        from pytools import all
         if not all(iaspace == overall_aspace for iaspace in desired_aspace_per_insn):
             raise LoopyError("not all instructions agree on the "
                     "the desired address space (private/local/global) of  the "
@@ -1004,7 +1003,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     def map_reduction_seq(expr, rec, nresults, arg_dtypes,
             reduction_dtypes):
-        outer_insn_inames = temp_kernel.insn_inames(insn)
+        outer_insn_inames = insn.within_inames
 
         from loopy.kernel.data import AddressSpace
         acc_var_names = make_temporaries(
@@ -1041,7 +1040,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
         update_id = insn_id_gen(
                 based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
 
-        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
+        update_insn_iname_deps = insn.within_inames | set(expr.inames)
         if insn.within_inames_is_final:
             update_insn_iname_deps = insn.within_inames | set(expr.inames)
 
@@ -1126,7 +1125,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         size = _get_int_iname_size(red_iname)
 
-        outer_insn_inames = temp_kernel.insn_inames(insn)
+        outer_insn_inames = insn.within_inames
 
         from loopy.kernel.data import LocalIndexTagBase
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
@@ -1363,7 +1362,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
     def map_scan_seq(expr, rec, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
-        outer_insn_inames = temp_kernel.insn_inames(insn)
+        outer_insn_inames = insn.within_inames
         inames_to_remove.add(scan_iname)
 
         track_iname = var_name_gen(
@@ -1417,7 +1416,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
         update_id = insn_id_gen(
                 based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
 
-        update_insn_iname_deps = temp_kernel.insn_inames(insn) | {track_iname}
+        update_insn_iname_deps = insn.within_inames | {track_iname}
         if insn.within_inames_is_final:
             update_insn_iname_deps = insn.within_inames | {track_iname}
 
@@ -1461,7 +1460,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             return map_reduction_seq(
                     expr, rec, nresults, arg_dtypes, reduction_dtypes)
 
-        outer_insn_inames = temp_kernel.insn_inames(insn)
+        outer_insn_inames = insn.within_inames
 
         from loopy.kernel.data import LocalIndexTagBase
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
@@ -1668,7 +1667,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 infer_arg_and_reduction_dtypes_for_reduction_expression(
                         temp_kernel, expr, unknown_types_ok))
 
-        outer_insn_inames = temp_kernel.insn_inames(insn)
+        outer_insn_inames = insn.within_inames
         bad_inames = frozenset(expr.inames) & outer_insn_inames
         if bad_inames:
             raise LoopyError("reduction used within loop(s) that it was "
@@ -1854,7 +1853,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     no_sync_with=insn.no_sync_with
                     | frozenset(new_insn_add_no_sync_with),
                     within_inames=(
-                        temp_kernel.insn_inames(insn)
+                        insn.within_inames
                         | new_insn_add_within_inames))
 
             kwargs.pop("id")
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 936c7c4d605cfcaebe57d4d61b862000b0b3bc3c..ccfe0d5ff9b403b9ed68bfabf7d69ec36bd66b57 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -182,7 +182,6 @@ def has_barrier_within(kernel, sched_index):
     if isinstance(sched_item, BeginBlockItem):
         loop_contents, _ = gather_schedule_block(
                 kernel.schedule, sched_index)
-        from pytools import any
         return any(isinstance(subsched_item, Barrier)
                 for subsched_item in loop_contents)
     elif isinstance(sched_item, Barrier):
@@ -296,7 +295,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                     continue
 
                 dep_insn = kernel.id_to_insn[dep_insn_id]
-                dep_insn_inames = kernel.insn_inames(dep_insn)
+                dep_insn_inames = dep_insn.within_inames
 
                 if iname in dep_insn_inames:
                     # Nothing to be learned, dependency is in loop over iname
@@ -940,7 +939,7 @@ def generate_loop_schedules_internal(
         if not is_ready:
             continue
 
-        want = kernel.insn_inames(insn) - sched_state.parallel_inames
+        want = insn.within_inames - sched_state.parallel_inames
         have = active_inames_set - sched_state.parallel_inames
 
         if want != have:
@@ -1046,8 +1045,9 @@ def generate_loop_schedules_internal(
                     sched_state.active_group_counts.keys()):
                 new_insn_ids_to_try = None
 
-            new_toposorted_insns = sched_state.insns_in_topologically_sorted_order[:]
-            new_toposorted_insns.remove(insn)
+            # explicitly use id to compare to avoid performance issues like #199
+            new_toposorted_insns = [x for x in
+                sched_state.insns_in_topologically_sorted_order if x.id != insn.id]
 
             # }}}
 
@@ -1106,7 +1106,7 @@ def generate_loop_schedules_internal(
 
             for insn_id in sched_state.unscheduled_insn_ids:
                 insn = kernel.id_to_insn[insn_id]
-                if last_entered_loop in kernel.insn_inames(insn):
+                if last_entered_loop in insn.within_inames:
                     if debug_mode:
                         print("cannot leave '%s' because '%s' still depends on it"
                                 % (last_entered_loop, format_insn(kernel, insn.id)))
@@ -1294,7 +1294,7 @@ def generate_loop_schedules_internal(
             for insn_id in reachable_insn_ids:
                 insn = kernel.id_to_insn[insn_id]
 
-                want = kernel.insn_inames(insn)
+                want = insn.within_inames
 
                 if hypothetically_active_loops <= want:
                     if usefulness is None:
diff --git a/loopy/schedule/checker/lexicographic_order_map.py b/loopy/schedule/checker/lexicographic_order_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9066030fbe499508d568ab561739fa8c31e07e5
--- /dev/null
+++ b/loopy/schedule/checker/lexicographic_order_map.py
@@ -0,0 +1,198 @@
+# coding: utf-8
+__copyright__ = "Copyright (C) 2019 James Stevens"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import islpy as isl
+
+
+def get_statement_ordering_map(
+        sched_before, sched_after, lex_map, before_marker="'"):
+    """Return a statement ordering, i.e., a map from each statement
+        instance to all statement instances occurring later.
+
+    :arg sched_before: An :class:`islpy.Map` giving the schedule of one
+        particular statement as a mapping from its statement instances
+        to lexicographic time. Typically, this statement is the dependee
+        in a dependency relationship.
+
+    :arg sched_after: An :class:`islpy.Map` giving the schedule of one
+        particular statement as a mapping from its statement instances
+        to lexicographic time. Typically, this statement is the depender
+        in a dependency relationship.
+
+    :arg lex_map: An :class:`islpy.Map` representing a lexicographic
+        ordering, i.e., mapping each point in lexicographic time to
+        every point that occurs later in lexicographic time. E.g.::
+
+            {[i0', i1', i2', ...] -> [i0, i1, i2, ...] :
+                i0' < i0 or (i0' = i0 and i1' < i1)
+                or (i0' = i0 and i1' = i1 and i2' < i2) ...}
+
+    :arg before_marker: A :class:`str` that is appended to the names of
+        the input dimensions of the result, i.e., those belonging to the
+        'before' statement in the 'happens before' relationship.
+
+    :returns: An :class:`islpy.Map` representing the statement ordering
+        as a mapping from each statement instance to all statement
+        instances occurring later. It is obtained by applying, in this
+        order, `sched_before`, `lex_map`, and the reverse of `sched_after`.
+    """
+    from loopy.schedule.checker.utils import (
+        append_marker_to_isl_map_var_names,
+    )
+
+    # statement instance -> all lexicographic times after its own
+    before_to_later_times = sched_before.apply_range(lex_map)
+
+    # ... -> statement instances that `sched_after` places at those times
+    ordering = before_to_later_times.apply_range(sched_after.reverse())
+
+    # Append marker to the names of the in_ dims
+    return append_marker_to_isl_map_var_names(
+        ordering, isl.dim_type.in_, before_marker)
+
+
+def get_lex_order_set(before_names, after_names, islvars=None):
+    """Return an :class:`islpy.Set` representing a lexicographic ordering
+        with the number of dimensions provided in `before_names`
+        (equal to the number of dimensions in `after_names`).
+
+    :arg before_names: A list of :class:`str` variable names for the
+        lexicographic space dimensions of the point that occurs *before*
+        the point named by `after_names`. (see example below)
+
+    :arg after_names: A list of :class:`str` variable names for the
+        lexicographic space dimensions of the point that occurs *after*
+        the point named by `before_names`; expected to be as long as
+        `before_names`. (see example below)
+
+    :arg islvars: A dictionary mapping the variable names in `before_names`
+        and `after_names` to :class:`islpy.PwAff` instances representing
+        these variables (islvars may be produced by
+        `islpy.make_zero_and_vars`). The key ``0`` is also included and
+        represents a :class:`islpy.PwAff` zero constant. This dictionary
+        defines the space to be used for the set. If no value is passed, the
+        dictionary will be made using `before_names` and `after_names`.
+
+    :returns: An :class:`islpy.Set` representing a big-endian lexicographic
+        ordering with the number of dimensions provided in `before_names`.
+        The set has one dimension for each name in *both* `before_names`
+        and `after_names`, and contains all points which meet the
+        'happens before' constraint defining the lexicographic ordering.
+        E.g., if `before_names = [i0', i1', i2']` and
+        `after_names = [i0, i1, i2]`, return the set of all points in a
+        3-dimensional, big-endian lexicographic ordering such that point
+        `[i0', i1', i2']` happens before `[i0, i1, i2]`. I.e., return::
+
+            {[i0', i1', i2', i0, i1, i2] :
+                i0' < i0 or (i0' = i0 and i1' < i1)
+                or (i0' = i0 and i1' = i1 and i2' < i2)}
+
+    """
+
+    if islvars is None:
+        # No islvars passed, so make them using the names provided
+        islvars = isl.make_zero_and_vars(before_names+after_names, [])
+
+    # Single-dimension building blocks of the ordering
+    def strictly_before(dim):
+        # Constraint (i<dim>' < i<dim>)
+        return islvars[before_names[dim]].lt_set(islvars[after_names[dim]])
+
+    def equal_in(dim):
+        # Constraint (i<dim>' = i<dim>)
+        return islvars[before_names[dim]].eq_set(islvars[after_names[dim]])
+
+    # The first disjunct is i0' < i0
+    ordering = strictly_before(0)
+
+    # Conjunction of the equalities for all dims preceding the current one,
+    # i.e., (i0' = i0 and ... i(d-1)' = i(d-1)); starts out as 'true'
+    prefix_equal = islvars[0].eq_set(islvars[0])
+
+    for dim in range(1, len(before_names)):
+        # Extend the equality conjunction by the previous dim
+        prefix_equal = prefix_equal & equal_in(dim-1)
+
+        # Disjunct for this dim: d equalities and one inequality, e.g.,
+        # (i0' = i0 and ... i(d-1)' = i(d-1) and id' < id); union it with
+        # the ordering built so far
+        ordering = ordering | (strictly_before(dim) & prefix_equal)
+
+    return ordering
+
+
+def create_lex_order_map(
+        n_dims=None,
+        before_names=None,
+        after_names=None,
+        ):
+    """Return a map from each point in a lexicographic ordering to every
+        point that occurs later in the lexicographic ordering.
+
+    :arg n_dims: An :class:`int` representing the number of dimensions
+        in the lexicographic ordering. If not provided, `n_dims` will be
+        set to the length of `after_names`.
+
+    :arg before_names: A list of :class:`str` names for the dimensions of
+        a point that occurs before another point, which is represented
+        using `after_names`. Defaults to `after_names` with an apostrophe
+        appended to each name. (see example below)
+
+    :arg after_names: A list of :class:`str` names for the dimensions of
+        a point that occurs after another point, which is represented
+        using `before_names`. Defaults to ``i0``, ``i1``, ... for `n_dims`
+        dimensions. (see example below)
+
+    :returns: An :class:`islpy.Map` representing a lexicographic
+        ordering as a mapping from each point in lexicographic time
+        to every point that occurs later in lexicographic time.
+        E.g., if `before_names = [i0', i1', i2']` and
+        `after_names = [i0, i1, i2]`, return the map::
+
+            {[i0', i1', i2'] -> [i0, i1, i2] :
+                i0' < i0 or (i0' = i0 and i1' < i1)
+                or (i0' = i0 and i1' = i1 and i2' < i2)}
+
+    :raises TypeError: If neither `n_dims` nor `after_names` is provided.
+    """
+    if after_names is None:
+        if n_dims is None:
+            # fail with a clear message instead of inside range(None)
+            raise TypeError("either 'n_dims' or 'after_names' is required")
+        after_names = ["i%s" % (i) for i in range(n_dims)]
+    if before_names is None:
+        from loopy.schedule.checker.utils import append_marker_to_strings
+        before_names = append_marker_to_strings(after_names, marker="'")
+    if n_dims is None:
+        n_dims = len(after_names)
+
+    assert len(before_names) == len(after_names) == n_dims
+
+    # First, get a set whose dims are the before dims followed by the after dims.
+    lex_order_set = get_lex_order_set(before_names, after_names)
+
+    # Now convert that set to a map by moving the after dims to the range.
+    lex_map = isl.Map.from_domain(lex_order_set)
+    return lex_map.move_dims(
+        isl.dim_type.out, 0, isl.dim_type.in_,
+        len(before_names), len(after_names))
diff --git a/loopy/schedule/checker/schedule.py b/loopy/schedule/checker/schedule.py
index bc71df5d8f9658c788141e17a6eaf948cf5aa635..a947da3ac029b8b49868d65472d8eb893d79a946 100644
--- a/loopy/schedule/checker/schedule.py
+++ b/loopy/schedule/checker/schedule.py
@@ -313,3 +313,27 @@ def generate_pairwise_schedules(
         pairwise_schedules[tuple(insn_ids)] = tuple(sched_maps)
 
     return pairwise_schedules
+
+
+def get_lex_order_map_for_sched_space(schedule):
+    """Return an :class:`islpy.Map` that maps each point in a
+        lexicographic ordering to every point that occurs later, for
+        the lexicographic space that `schedule` maps into.
+
+    :arg schedule: A :class:`islpy.Map` representing the ordering of
+        statement instances as a mapping from statement instances to
+        lexicographic time.
+
+    :returns: The :class:`islpy.Map` created by `create_lex_order_map`
+        when the names of the output dimensions of `schedule` are passed
+        as the names of the later point. The dimension count therefore
+        matches the output dimension of `schedule`; the names of the
+        earlier point are left to that function's default.
+
+    """
+    from loopy.schedule.checker.lexicographic_order_map import (
+        create_lex_order_map,
+    )
+
+    out_dim_names = schedule.space.get_var_names(isl.dim_type.out)
+    return create_lex_order_map(after_names=out_dim_names)
diff --git a/loopy/schedule/checker/utils.py b/loopy/schedule/checker/utils.py
index 8e2a82a016202f054bb808887caa27c2f363842b..db1d861c8a76268a54468b12e9e9d77e016d58db 100644
--- a/loopy/schedule/checker/utils.py
+++ b/loopy/schedule/checker/utils.py
@@ -87,6 +87,35 @@ def ensure_dim_names_match_and_align(obj_map, tgt_map):
     return isl.align_spaces(obj_map, tgt_map)
 
 
+def append_marker_to_isl_map_var_names(old_isl_map, dim_type, marker="'"):
+    """Return a copy of an :class:`islpy.Map` in which `marker` is
+    appended to the name of every dimension of one dimension type.
+
+    :arg old_isl_map: An :class:`islpy.Map`.
+
+    :arg dim_type: An :class:`islpy.dim_type`, i.e., an :class:`int`,
+        specifying the dimension type whose names are to be marked.
+
+    :arg marker: A :class:`str` to be appended to the specified dimension
+        names. If not provided, `marker` defaults to an apostrophe.
+
+    :returns: An :class:`islpy.Map` matching `old_isl_map` with
+        `marker` appended to the `dim_type` dimension names.
+
+    """
+
+    marked_map = old_isl_map.copy()
+    for idx, name in enumerate(old_isl_map.get_var_names(dim_type)):
+        marked_map = marked_map.set_dim_name(dim_type, idx, name+marker)
+
+    return marked_map
+
+
+def append_marker_to_strings(strings, marker="'"):
+    assert isinstance(strings, list)  # only lists are accepted
+    return [name + marker for name in strings]
+
+
 def sorted_union_of_names_in_isl_sets(
         isl_sets,
         set_dim=isl.dim_type.set):
diff --git a/loopy/statistics.py b/loopy/statistics.py
index eda750120bc8456e9090304cbd2905a02ff2358e..a0a0f9c7ed7c62e4ec8f6ca517809696abfd2a8d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1239,7 +1239,7 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
     l_used = set()
 
     from loopy.kernel.data import LocalIndexTag, GroupIndexTag
-    for iname in knl.insn_inames(insn):
+    for iname in insn.within_inames:
         tags = knl.iname_tags_of_type(iname,
                               (LocalIndexTag, GroupIndexTag), max_num=1)
         if tags:
@@ -1273,7 +1273,7 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
 
 def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False):
 
-    insn_inames = knl.insn_inames(insn)
+    insn_inames = insn.within_inames
 
     if disregard_local_axes:
         from loopy.kernel.data import LocalIndexTag
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 7e5de3164de761c01f41981a850bb14f6895c95d..77f8228b66a9af0e2cb500bb7d012887e9c94fcc 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1002,20 +1002,77 @@ class RuleAwareIdentityMapper(IdentityMapper):
                         lambda expr: self(expr, kernel, insn)))
                 for insn in kernel.instructions]
 
-        return kernel.copy(instructions=new_insns)
+        from functools import partial
+
+        non_insn_self = partial(self, kernel=kernel, insn=None)
+
+        from loopy.kernel.array import ArrayBase
+
+        # {{{ args
+
+        new_args = [
+                arg.map_exprs(non_insn_self) if isinstance(arg, ArrayBase) else arg
+                for arg in kernel.args]
+
+        # }}}
+
+        # {{{ tvs
+
+        new_tvs = {
+                tv_name: tv.map_exprs(non_insn_self)
+                for tv_name, tv in kernel.temporary_variables.items()}
+
+        # }}}
+
+        # domains, var names: not exprs => do not map
+
+        return kernel.copy(instructions=new_insns,
+                           args=new_args,
+                           temporary_variables=new_tvs)
 
 
 class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper):
+    """
+    Mapper to substitute expressions and record any divergence of substitution
+    rule expressions of :class:`loopy.LoopKernel`.
+
+    .. attribute:: rule_mapping_context
+
+        An instance of :class:`SubstitutionRuleMappingContext` to record
+        divergence of substitution rules.
+
+    .. attribute:: within
+
+        An instance of :class:`loopy.match.StackMatchComponent`.
+        :class:`RuleAwareSubstitutionMapper` would perform
+        substitutions in the expression if the stack match is ``True`` or
+        if the expression does not arise from an :class:`~loopy.InstructionBase`.
+
+    .. note::
+
+        The mapped kernel should be passed through
+        :meth:`SubstitutionRuleMappingContext.finish_kernel` to perform any
+        renaming mandated by the rule expression divergences.
+    """
     def __init__(self, rule_mapping_context, subst_func, within):
         super().__init__(rule_mapping_context)
 
         self.subst_func = subst_func
-        self.within = within
+        self._within = within
+
+    def within(self, kernel, instruction, stack):
+        """Return whether substitutions apply to an expression found in
+        *instruction*, with *None* meaning it is not from an instruction.
+        """
+        # always perform substitutions on expressions not coming from
+        # instructions.
+        return instruction is None or self._within(kernel, instruction, stack)
 
     def map_variable(self, expr, expn_state):
         if (expr.name in expn_state.arg_context
                 or not self.within(
                     expn_state.kernel, expn_state.instruction, expn_state.stack)):
+            # expr not in within => do nothing (call IdentityMapper)
             return super().map_variable(
                     expr, expn_state)
 
@@ -1525,7 +1582,13 @@ def qpolynomial_from_expr(space, expr):
 def simplify_using_aff(kernel, expr):
     inames = get_dependencies(expr) & kernel.all_inames()
 
-    domain = kernel.get_inames_domain(inames)
+    # FIXME: Ideally, we should find out what inames are usable and allow
+    # the simplification to use all of those. For now, fall back to making
+    # sure that the simplification only uses inames that were already there.
+    domain = (
+            kernel
+            .get_inames_domain(inames)
+            .project_out_except(inames, [dim_type.set]))
 
     try:
         aff = guarded_aff_from_expr(domain.space, expr)
@@ -1679,6 +1742,25 @@ def isl_set_from_expr(space, expr):
 
     return set_
 
+
+def condition_to_set(space, expr):
+    """
+    Returns *expr* as an instance of :class:`islpy.Set` on *space*, or
+    *None* if *expr* depends on names that are not variables of *space*
+    or if it cannot be converted to an affine expression.
+    """
+    from loopy.symbolic import get_dependencies, isl_set_from_expr
+
+    if not get_dependencies(expr) <= frozenset(space.get_var_dict()):
+        # data-dependent condition: can't do much
+        return None
+
+    try:
+        return isl_set_from_expr(space, expr)
+    except ExpressionToAffineConversionError:
+        # non-affine condition: can't do much
+        return None
+
 # }}}
 
 
@@ -2036,7 +2118,7 @@ class AccessRangeOverlapChecker:
         arm = BatchedAccessRangeMapper(self.kernel, self.vars, overestimate=True)
 
         for expr in exprs:
-            arm(expr, self.kernel.insn_inames(insn))
+            arm(expr, insn.within_inames)
 
         for name, arange in arm.access_ranges.items():
             if arm.bad_subscripts[name]:
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 6bad214ec4e10a91e36b3566f454eabab00dde26..8af47c41222416fbd2dbe3dc5a88d4090a4a06f0 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -39,6 +39,14 @@ __doc__ = """
 .. autoclass:: NumbaTarget
 .. autoclass:: NumbaCudaTarget
 
+References to Canonical Names
+-----------------------------
+
+.. currentmodule:: loopy.target
+
+.. class:: TargetBase
+
+    See :class:`loopy.TargetBase`.
 """
 
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 3234da45d469e2cc71de4733f3186aea8a93b065..d1e474c2054a15688f00b2bd5f5c6d9e6e9975df 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -34,6 +34,9 @@ from loopy.symbolic import IdentityMapper
 from loopy.types import NumpyType
 import pymbolic.primitives as p
 
+from loopy.tools import remove_common_indentation
+import re
+
 from pytools import memoize_method
 
 __doc__ = """
@@ -172,6 +175,46 @@ def _preamble_generator(preamble_info):
             yield ("04_%s" % func_name, func_body)
             yield undef_integer_types_macro
 
+    for func in preamble_info.seen_functions:
+        if func.name == "int_pow":
+            base_ctype = preamble_info.kernel.target.dtype_to_typename(
+                    func.arg_dtypes[0])
+            exp_ctype = preamble_info.kernel.target.dtype_to_typename(
+                    func.arg_dtypes[1])
+            res_ctype = preamble_info.kernel.target.dtype_to_typename(
+                    func.result_dtypes[0])
+
+            if func.arg_dtypes[1].numpy_dtype.kind == "u":
+                signed_exponent_preamble = ""
+            else:
+                signed_exponent_preamble = "\n" + remove_common_indentation(
+                        """
+                        if (n < 0) {
+                          x = 1.0/x;
+                          n =  -n;
+                        }""")
+
+            yield(f"07_{func.c_name}", f"""
+            inline {res_ctype} {func.c_name}({base_ctype} x, {exp_ctype} n) {{
+              if (n == 0)
+                return 1;
+              {re.sub("^", 14*" ", signed_exponent_preamble, flags=re.M)}
+
+              {res_ctype} y = 1;
+
+              while (n > 1) {{
+                if (n % 2) {{
+                  y = x * y;
+                  x = x * x;
+                }}
+                else
+                  x = x * x;
+                n = n / 2;
+              }}
+
+              return x*y;
+            }}""")
+
 # }}}
 
 
@@ -447,14 +490,14 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
                 arg_dtypes=arg_dtypes)
 
     # binary functions
-    if (name in ["fmax", "fmin", "copysign"]
+    if (name in ["fmax", "fmin", "copysign", "pow"]
             and len(arg_dtypes) == 2):
 
         dtype = np.find_common_type(
             [], [dtype.numpy_dtype for dtype in arg_dtypes])
 
         if dtype.kind == "c":
-            raise LoopyTypeError("%s does not support complex numbers")
+            raise LoopyTypeError(f"{name} does not support complex numbers")
 
         elif dtype.kind == "f":
             if modify_name:
@@ -942,7 +985,8 @@ class CFamilyASTBuilder(ASTBuilderBase):
         codegen_state.seen_functions.add(
                 SeenFunction(func_id,
                     mangle_result.target_name,
-                    mangle_result.arg_dtypes))
+                    mangle_result.arg_dtypes,
+                    mangle_result.result_dtypes))
 
         from pymbolic import var
         for i, (a, tgt_dtype) in enumerate(
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 74f1ead8bcb3240f2cf3775048f7cc809e367a1f..9ec99c784f5955232038644de6ee06dd6466237a 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -325,7 +325,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             self.codegen_state.seen_functions.add(
                     SeenFunction(
                         name, f"{name}_{suffix}",
-                        (result_dtype, result_dtype)))
+                        (result_dtype, result_dtype),
+                        (result_dtype,)))
 
         if den_nonneg:
             if num_nonneg:
@@ -538,7 +539,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         self.codegen_state.seen_functions.add(
                 SeenFunction(identifier,
                     mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
+                    mangle_result.arg_dtypes or par_dtypes,
+                    mangle_result.result_dtypes))
 
         return var(mangle_result.target_name)(*processed_parameters)
 
@@ -701,6 +703,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                     self.rec(expr.denominator, type_context, tgt_dtype))
 
     def map_power(self, expr, type_context):
+        tgt_dtype = self.infer_type(expr)
+        base_dtype = self.infer_type(expr.base)
+        exponent_dtype = self.infer_type(expr.exponent)
+
         def base_impl(expr, type_context):
             from pymbolic.primitives import is_constant, is_zero
             if is_constant(expr.exponent):
@@ -711,14 +717,24 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 elif is_zero(expr.exponent - 2):
                     return self.rec(expr.base*expr.base, type_context)
 
-            return type(expr)(
-                    self.rec(expr.base, type_context),
-                    self.rec(expr.exponent, type_context))
+            if exponent_dtype.is_integral():
+                from loopy.codegen import SeenFunction
+                func_name = ("loopy_pow_"
+                        f"{tgt_dtype.numpy_dtype}_{exponent_dtype.numpy_dtype}")
+
+                self.codegen_state.seen_functions.add(
+                        SeenFunction(
+                            "int_pow", func_name,
+                            (tgt_dtype, exponent_dtype),
+                            (tgt_dtype, )))
+                return var(func_name)(self.rec(expr.base, type_context),
+                                      self.rec(expr.exponent, type_context))
+            else:
+                return self.rec(var("pow")(expr.base, expr.exponent), type_context)
 
         if not self.allow_complex:
             return base_impl(expr, type_context)
 
-        tgt_dtype = self.infer_type(expr)
         if tgt_dtype.is_complex():
             if expr.exponent in [2, 3, 4]:
                 value = expr.base
@@ -726,8 +742,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                     value = value * expr.base
                 return self.rec(value, type_context)
             else:
-                b_complex = self.infer_type(expr.base).is_complex()
-                e_complex = self.infer_type(expr.exponent).is_complex()
+                b_complex = base_dtype.is_complex()
+                e_complex = exponent_dtype.is_complex()
 
                 if b_complex and not e_complex:
                     return var("%s_powr" % self.complex_type_name(tgt_dtype))(
@@ -754,6 +770,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 # {{{ C expression to code mapper
 
 class CExpressionToCodeMapper(RecursiveMapper):
+
     # {{{ helpers
 
     def parenthesize_if_needed(self, s, enclosing_prec, my_prec):
@@ -954,9 +971,8 @@ class CExpressionToCodeMapper(RecursiveMapper):
         return self._map_division_operator("%", expr, enclosing_prec)
 
     def map_power(self, expr, enclosing_prec):
-        return "pow({}, {})".format(
-                self.rec(expr.base, PREC_NONE),
-                self.rec(expr.exponent, PREC_NONE))
+        raise RuntimeError(f"'{expr}' should have been transformed to 'Call'"
+                           " expression node.")
 
     def map_array_literal(self, expr, enclosing_prec):
         return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE)
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 2023077bf8f286d9c28cdee2e37f194276dc211a..67dc1fe249af91d9b73a7162867dcd98c7ef6bc7 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -127,6 +127,18 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 
         return dtype, name
 
+    if name in ["pow"] and len(arg_dtypes) == 2:
+        dtype = np.find_common_type([], arg_dtypes)
+
+        if dtype == np.float64:
+            pass  # pow
+        elif dtype == np.float32:
+            name = name + "f"  # powf
+        else:
+            raise RuntimeError(f"{name} does not support type {dtype}")
+
+        return dtype, name
+
     if name in "atan2" and len(arg_dtypes) == 2:
         return arg_dtypes[0], name
 
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 74819b93932e0852a59c3ebacb99f9eaafab0a05..74887155b920e6d514df673c1ed8897486a4f81f 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -281,20 +281,20 @@ class ExecutionWrapperGeneratorBase:
                                     'passed array")'
                                     % (arg.name, impl_array_name))
 
-                        base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                        if not options.skip_arg_checks:
-                            gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
-                                    % (arg.name, impl_array_name, stride_impl_axis,
-                                        base_arg.dtype.dtype.itemsize))
+                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
 
-                            gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' "
-                                    ' is not divisible by its dtype itemsize"'
-                                    % (stride_impl_axis, impl_array_name))
-                            gen("del _lpy_remdr")
-                        else:
-                            gen("%s = _lpy_offset // %d"
-                                    % (arg.name, base_arg.dtype.itemsize))
+                    if not options.skip_arg_checks:
+                        gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
+                                % (arg.name, impl_array_name, stride_impl_axis,
+                                    base_arg.dtype.dtype.itemsize))
+
+                        gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' "
+                                ' is not divisible by its dtype itemsize"'
+                                % (stride_impl_axis, impl_array_name))
+                        gen("del _lpy_remdr")
+                    else:
+                        gen("%s = _lpy_offset // %d"
+                                % (arg.name, base_arg.dtype.itemsize))
 
         gen("# }}}")
         gen("")
@@ -639,8 +639,6 @@ class ExecutionWrapperGeneratorBase:
                     if issubclass(idi.arg_class, KernelArgument)
                     ])
 
-        gen.add_to_preamble("from __future__ import division")
-        gen.add_to_preamble("")
         self.target_specific_preamble(gen)
         gen.add_to_preamble("")
         self.generate_host_code(gen, codegen_result)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 2ff9ede55e8c3ab5b5e1237b2a66c72635e1454b..c409df380c5a6b1e47cfcc9773aee2bee16ba1a8 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -28,7 +28,7 @@ import numpy as np
 from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from pytools import memoize_method
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
 from loopy.kernel.data import AddressSpace, CallMangleInfo
@@ -181,6 +181,22 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
                     result_dtypes=(result_dtype,),
                     arg_dtypes=2*(result_dtype,))
 
+    if name == "pow" and len(arg_dtypes) == 2:
+        dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for dtype in arg_dtypes])
+        if dtype == np.float64:
+            name = "powf64"
+        elif dtype == np.float32:
+            name = "powf32"
+        else:
+            raise LoopyTypeError(f"'pow' does not support type {dtype}.")
+
+        result_dtype = NumpyType(dtype)
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(result_dtype,),
+                arg_dtypes=2*(result_dtype,))
+
     if name == "dot":
         scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
         return CallMangleInfo(
@@ -286,6 +302,19 @@ def opencl_preamble_generator(preamble_info):
                 """ % dict(idx_ctype=kernel.target.dtype_to_typename(
                     kernel.index_dtype))))
 
+    for func in preamble_info.seen_functions:
+        if func.name == "pow" and func.c_name == "powf32":
+            yield("08_clpowf32", """
+            inline float powf32(float x, float y) {
+              return pow(x, y);
+            }""")
+
+        if func.name == "pow" and func.c_name == "powf64":
+            yield("08_clpowf64", """
+            inline double powf64(double x, double y) {
+              return pow(x, y);
+            }""")
+
 # }}}
 
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index a17416c47bb290285972390ae161771bac8f77e9..8d0c309b08b8df4cda7e13c097441ef272449a02 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -509,14 +509,6 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
                 Raise('RuntimeError("input argument \'{name}\' '
                         'must be supplied")'.format(name=idi.name))))
 
-        if idi.dtype.is_integral():
-            gen(Comment("cast to Python int to avoid trouble "
-                "with struct packing or Boost.Python"))
-            py_type = "int"
-
-            gen(Assign(idi.name, f"{py_type}({idi.name})"))
-            gen(Line())
-
         if idi.dtype.is_composite():
             gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
             cl_arg_idx += 1
@@ -578,7 +570,7 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
                 fp_arg_count += 1
 
             gen(S(
-                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
+                "_lpy_knl._set_arg_buf(%d, _lpy_pack('%s', %s))"
                 % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))
 
             cl_arg_idx += 1
@@ -632,25 +624,22 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
                     if not issubclass(idi.arg_class, TemporaryVariable)]
                 + ["wait_for=None", "allocator=None"])
 
-        from genpy import (For, Function, Suite, Import, ImportAs, Return,
-                FromImport, Line, Statement as S)
+        from genpy import (For, Function, Suite, Return, Line, Statement as S)
         return Function(
                 codegen_result.current_program(codegen_state).name,
                 args,
                 Suite([
-                    FromImport("struct", ["pack as _lpy_pack"]),
-                    ImportAs("pyopencl", "_lpy_cl"),
-                    Import("pyopencl.tools"),
                     Line(),
                     ] + [
                     Line(),
                     function_body,
                     Line(),
-                    ] + [
-                    For("_tv", "_global_temporaries",
-                        # free global temporaries
-                        S("_tv.release()"))
-                    ] + [
+                    ] + ([
+                        For("_tv", "_global_temporaries",
+                            # free global temporaries
+                            S("_tv.release()"))
+                        ] if self._get_global_temporaries(codegen_state) else []
+                    ) + [
                     Line(),
                     Return("_lpy_evt"),
                     ]))
@@ -660,6 +649,14 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         # no such thing in Python
         return None
 
+    def _get_global_temporaries(self, codegen_state):
+        from loopy.kernel.data import AddressSpace
+
+        return sorted(
+            (tv for tv in codegen_state.kernel.temporary_variables.values()
+            if tv.address_space == AddressSpace.GLOBAL),
+            key=lambda tv: tv.name)
+
     def get_temporary_decls(self, codegen_state, schedule_state):
         from genpy import Assign, Comment, Line
 
@@ -668,18 +665,12 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
             from operator import mul
             return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1)
 
-        from loopy.kernel.data import AddressSpace
-
-        global_temporaries = sorted(
-            (tv for tv in codegen_state.kernel.temporary_variables.values()
-            if tv.address_space == AddressSpace.GLOBAL),
-            key=lambda tv: tv.name)
-
         from pymbolic.mapper.stringifier import PREC_NONE
         ecm = self.get_expression_to_code_mapper(codegen_state)
 
+        global_temporaries = self._get_global_temporaries(codegen_state)
         if not global_temporaries:
-            return [Assign("_global_temporaries", "[]"), Line()]
+            return []
 
         return [
             Comment("{{{ allocate global temporaries"),
@@ -734,8 +725,13 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
             arry_arg_code,
             Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
                 "queue, _lpy_knl, "
-                "%(gsize)s, %(lsize)s,  wait_for=wait_for, "
-                "g_times_l=True, allow_empty_ndrange=True)"
+                "%(gsize)s, %(lsize)s, "
+                # using positional args because pybind is slow with kwargs
+                "None, "  # offset
+                "wait_for, "
+                "True, "  # g_times_l
+                "True, "  # allow_empty_ndrange
+                ")"
                 % dict(
                     pyopencl_module_name=self.target.pyopencl_module_name,
                     gsize=ecm(gsize, prec=PREC_NONE, type_context="i"),
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 7fc20f19167af62f86e9fb18690b2f03f932e63b..cdee5600bb5dd0dce3a3971583604f737c6913d9 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -142,6 +142,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         gen.add_to_preamble("import pyopencl as _lpy_cl")
         gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
         gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
+        gen.add_to_preamble("from struct import pack as _lpy_pack")
 
     def initialize_system_args(self, gen):
         """
diff --git a/loopy/target/python.py b/loopy/target/python.py
index e54aa622f0b56360cb1b3f04be118c1319db7d3b..a1557e47bdf8990e7aa89472b59f3c9fc3666a05 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -118,7 +118,8 @@ class ExpressionToPythonMapper(StringifyMapper):
         self.codegen_state.seen_functions.add(
                 SeenFunction(identifier,
                     mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
+                    mangle_result.arg_dtypes or par_dtypes,
+                    mangle_result.result_dtypes))
 
         return "{}({})".format(mangle_result.target_name, ", ".join(str_parameters))
 
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a50725d20d579109f6e061fba0a1f408a6e23e93..e946a67c0cf067b4701a5ab4bcd86594d42c5b4c 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -631,6 +631,8 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):
         raise LoopyError("argument name '%s' conflicts with an existing identifier"
                 "--cannot rename" % new_name)
 
+    # {{{ instructions
+
     from pymbolic import var
     subst_dict = {old_name: var(new_name)}
 
@@ -644,7 +646,11 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):
                     make_subst_func(subst_dict),
                     within=lambda kernel, insn, stack: True)
 
-    kernel = smap.map_kernel(kernel)
+    kernel = rule_mapping_context.finish_kernel(smap.map_kernel(kernel))
+
+    # }}}
+
+    # {{{ args
 
     new_args = []
     for arg in kernel.args:
@@ -653,7 +659,22 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):
 
         new_args.append(arg)
 
-    return kernel.copy(args=new_args)
+    # }}}
+
+    # {{{ domain
+
+    new_domains = []
+    for dom in kernel.domains:
+        dom_var_dict = dom.get_var_dict()
+        if old_name in dom_var_dict:
+            dt, pos = dom_var_dict[old_name]
+            dom = dom.set_dim_name(dt, pos, new_name)
+
+        new_domains.append(dom)
+
+    # }}}
+
+    return kernel.copy(domains=new_domains, args=new_args)
 
 # }}}
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 241c1492d4c41124c21befb2739fae349538c908..fb5e8d781ebc3f8c806dfa7b531560f0855c98d5 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -118,25 +118,25 @@ def prioritize_loops(kernel, loop_priority):
 
 class _InameSplitter(RuleAwareIdentityMapper):
     def __init__(self, rule_mapping_context, within,
-            split_iname, outer_iname, inner_iname, replacement_index):
+            iname_to_split, outer_iname, inner_iname, replacement_index):
         super().__init__(rule_mapping_context)
 
         self.within = within
 
-        self.split_iname = split_iname
+        self.iname_to_split = iname_to_split
         self.outer_iname = outer_iname
         self.inner_iname = inner_iname
 
         self.replacement_index = replacement_index
 
     def map_reduction(self, expr, expn_state):
-        if (self.split_iname in expr.inames
-                and self.split_iname not in expn_state.arg_context
+        if (self.iname_to_split in expr.inames
+                and self.iname_to_split not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
                     expn_state.instruction)):
             new_inames = list(expr.inames)
-            new_inames.remove(self.split_iname)
+            new_inames.remove(self.iname_to_split)
             new_inames.extend([self.outer_iname, self.inner_iname])
 
             from loopy.symbolic import Reduction
@@ -147,8 +147,8 @@ class _InameSplitter(RuleAwareIdentityMapper):
             return super().map_reduction(expr, expn_state)
 
     def map_variable(self, expr, expn_state):
-        if (expr.name == self.split_iname
-                and self.split_iname not in expn_state.arg_context
+        if (expr.name == self.iname_to_split
+                and self.iname_to_split not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
                     expn_state.instruction)):
@@ -157,7 +157,58 @@ class _InameSplitter(RuleAwareIdentityMapper):
             return super().map_variable(expr, expn_state)
 
 
-def _split_iname_backend(kernel, split_iname,
+def _split_iname_in_set(s, iname_to_split, inner_iname, outer_iname, fixed_length,
+        fixed_length_is_inner):
+    var_dict = s.get_var_dict()
+
+    if iname_to_split not in var_dict:
+        return s
+
+    orig_dim_type, _ = var_dict[iname_to_split]
+    # orig_dim_type may be set or param (the latter if the iname is
+    # used as a parameter in a subdomain).
+
+    # NB: dup_iname_to_split is not a globally valid identifier: only unique
+    # wrt the set s.
+    from pytools import generate_unique_names
+    for dup_iname_to_split in generate_unique_names(f"dup_{iname_to_split}"):
+        if dup_iname_to_split not in var_dict:
+            break
+
+    from loopy.isl_helpers import duplicate_axes
+    s = duplicate_axes(s, (iname_to_split,), (dup_iname_to_split,))
+
+    outer_var_nr = s.dim(orig_dim_type)
+    inner_var_nr = s.dim(orig_dim_type)+1
+
+    s = s.add_dims(orig_dim_type, 2)
+    s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
+    s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)
+
+    from loopy.isl_helpers import make_slab
+
+    if fixed_length_is_inner:
+        fixed_iname, var_length_iname = inner_iname, outer_iname
+    else:
+        fixed_iname, var_length_iname = outer_iname, inner_iname
+
+    space = s.get_space()
+    s = s & (
+            make_slab(space, fixed_iname, 0, fixed_length)
+            # dup_iname_to_split = fixed_iname + fixed_length*var_length_iname
+            .add_constraint(isl.Constraint.eq_from_names(
+                space, {
+                    dup_iname_to_split: 1,
+                    fixed_iname: -1,
+                    var_length_iname: -fixed_length})))
+
+    dup_iname_dim_type, dup_name_idx = space.get_var_dict()[dup_iname_to_split]
+    s = s.project_out(dup_iname_dim_type, dup_name_idx, 1)
+
+    return s
+
+
+def _split_iname_backend(kernel, iname_to_split,
         fixed_length, fixed_length_is_inner,
         make_new_loop_index,
         outer_iname=None, inner_iname=None,
@@ -186,88 +237,47 @@ def _split_iname_backend(kernel, split_iname,
 
     # }}}
 
-    existing_tags = kernel.iname_tags(split_iname)
+    existing_tags = kernel.iname_tags(iname_to_split)
     from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type
     if (do_tagged_check and existing_tags
             and not filter_iname_tags_by_type(existing_tags, ForceSequentialTag)):
-        raise LoopyError("cannot split already tagged iname '%s'" % split_iname)
+        raise LoopyError(f"cannot split already tagged iname '{iname_to_split}'")
 
-    if split_iname not in kernel.all_inames():
-        raise ValueError("cannot split loop for unknown variable '%s'" % split_iname)
+    if iname_to_split not in kernel.all_inames():
+        raise ValueError(
+                f"cannot split loop for unknown variable '{iname_to_split}'")
 
     applied_iname_rewrites = kernel.applied_iname_rewrites[:]
 
     vng = kernel.get_var_name_generator()
 
     if outer_iname is None:
-        outer_iname = vng(split_iname+"_outer")
+        outer_iname = vng(iname_to_split+"_outer")
     if inner_iname is None:
-        inner_iname = vng(split_iname+"_inner")
-
-    def process_set(s):
-        var_dict = s.get_var_dict()
-
-        if split_iname not in var_dict:
-            return s
-
-        orig_dim_type, _ = var_dict[split_iname]
+        inner_iname = vng(iname_to_split+"_inner")
 
-        outer_var_nr = s.dim(orig_dim_type)
-        inner_var_nr = s.dim(orig_dim_type)+1
-
-        s = s.add_dims(orig_dim_type, 2)
-        s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
-        s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)
-
-        from loopy.isl_helpers import make_slab
-
-        if fixed_length_is_inner:
-            fixed_iname, var_length_iname = inner_iname, outer_iname
-        else:
-            fixed_iname, var_length_iname = outer_iname, inner_iname
-
-        space = s.get_space()
-        fixed_constraint_set = (
-                make_slab(space, fixed_iname, 0, fixed_length)
-                # name = fixed_iname + fixed_length*var_length_iname
-                .add_constraint(isl.Constraint.eq_from_names(
-                    space, {
-                        split_iname: 1,
-                        fixed_iname: -1,
-                        var_length_iname: -fixed_length})))
-
-        name_dim_type, name_idx = space.get_var_dict()[split_iname]
-        s = s.intersect(fixed_constraint_set)
-
-        def _project_out_only_if_all_instructions_in_within():
-            for insn in kernel.instructions:
-                if split_iname in insn.within_inames and (
-                        not within(kernel, insn)):
-                    return s
-
-            return s.project_out(name_dim_type, name_idx, 1)
-
-        return _project_out_only_if_all_instructions_in_within()
-
-    new_domains = [process_set(dom) for dom in kernel.domains]
+    new_domains = [
+            _split_iname_in_set(dom, iname_to_split, inner_iname, outer_iname,
+                fixed_length, fixed_length_is_inner)
+            for dom in kernel.domains]
 
     from pymbolic import var
     inner = var(inner_iname)
     outer = var(outer_iname)
     new_loop_index = make_new_loop_index(inner, outer)
 
-    subst_map = {var(split_iname): new_loop_index}
+    subst_map = {var(iname_to_split): new_loop_index}
     applied_iname_rewrites.append(subst_map)
 
     # {{{ update within_inames
 
     new_insns = []
     for insn in kernel.instructions:
-        if split_iname in insn.within_inames and (
+        if iname_to_split in insn.within_inames and (
                 within(kernel, insn)):
             new_within_inames = (
                     (insn.within_inames.copy()
-                    - frozenset([split_iname]))
+                    - frozenset([iname_to_split]))
                     | frozenset([outer_iname, inner_iname]))
         else:
             new_within_inames = insn.within_inames
@@ -286,7 +296,7 @@ def _split_iname_backend(kernel, split_iname,
     for prio in kernel.loop_priority:
         new_prio = ()
         for prio_iname in prio:
-            if prio_iname == split_iname:
+            if prio_iname == iname_to_split:
                 new_prio = new_prio + (outer_iname, inner_iname)
             else:
                 new_prio = new_prio + (prio_iname,)
@@ -302,7 +312,7 @@ def _split_iname_backend(kernel, split_iname,
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
     ins = _InameSplitter(rule_mapping_context, within,
-            split_iname, outer_iname, inner_iname, new_loop_index)
+            iname_to_split, outer_iname, inner_iname, new_loop_index)
 
     kernel = ins.map_kernel(kernel)
     kernel = rule_mapping_context.finish_kernel(kernel)
@@ -311,7 +321,10 @@ def _split_iname_backend(kernel, split_iname,
         kernel = tag_inames(kernel,
                 {outer_iname: existing_tag, inner_iname: existing_tag})
 
-    return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
+    kernel = tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
+    kernel = remove_unused_inames(kernel, [iname_to_split])
+
+    return kernel
 
 # }}}
 
@@ -319,6 +332,7 @@ def _split_iname_backend(kernel, split_iname,
 # {{{ split iname
 
 def split_iname(kernel, split_iname, inner_length,
+        *,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
         slabs=(0, 0), do_tagged_check=True,
@@ -1197,16 +1211,22 @@ def remove_unused_inames(kernel, inames=None):
 
     # {{{ remove them
 
-    from loopy.kernel.tools import DomainChanger
-
+    domains = kernel.domains
     for iname in unused_inames:
-        domch = DomainChanger(kernel, (iname,))
+        new_domains = []
+
+        for dom in domains:
+            try:
+                dt, idx = dom.get_var_dict()[iname]
+            except KeyError:
+                pass
+            else:
+                dom = dom.project_out(dt, idx, 1)
+            new_domains.append(dom)
 
-        dom = domch.domain
-        dt, idx = dom.get_var_dict()[iname]
-        dom = dom.project_out(dt, idx, 1)
+        domains = new_domains
 
-        kernel = kernel.copy(domains=domch.get_domains_with(dom))
+    kernel = kernel.copy(domains=domains)
 
     # }}}
 
@@ -1589,7 +1609,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
     insns = [insn for insn in kernel.instructions if match(kernel, insn)]
 
     for insn in insns:
-        for iname in kernel.insn_inames(insn):
+        for iname in insn.within_inames:
             if kernel.iname_tags_of_type(iname, kind):
                 used_axes.add(kind.axis)
 
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 8527023bc789c9b3c9e18fe7ad6827c82a6e7a55..ce2d7942b70c68a79fd5c6ddc36b24fd6896cc04 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -124,7 +124,7 @@ def privatize_temporaries_with_inames(
         for writer_insn_id in wmap.get(tv.name, []):
             writer_insn = kernel.id_to_insn[writer_insn_id]
 
-            priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames
+            priv_axis_inames = writer_insn.within_inames & privatizing_inames
 
             referenced_priv_axis_inames = (priv_axis_inames
                     & writer_insn.write_dependency_names())
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 64337864f48e42f096ab851dd5b71afd607f067e..787966efc7fd00ad282e60990846ce07004e7906 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -216,8 +216,12 @@ class TypeInferenceMapper(CombineMapper):
             # Numpy types are sized
             return [NumpyType(np.dtype(type(expr)))]
         elif dt.kind == "f":
-            # deduce the smaller type by default
-            return [NumpyType(np.dtype(np.float32))]
+            if np.float32(expr) == np.float64(expr):
+                # No precision is lost by 'guessing' single precision, use that.
+                # This at least covers simple cases like '1.0'.
+                return [NumpyType(np.dtype(np.float32))]
+
+            return [NumpyType(np.dtype(np.float64))]
         elif dt.kind == "c":
             if np.complex64(expr) == np.complex128(expr):
                 # (COMPLEX_GUESS_LOGIC)
diff --git a/loopy/version.py b/loopy/version.py
index fddd44479adcae87ec96f470a690274b154fde54..6f66c5347c55042ebf7b220a658bb4ebf3fef04d 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -42,7 +42,7 @@ else:
 # }}}
 
 
-VERSION = (2020, 2, 1)
+VERSION = (2020, 2, 2)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
diff --git a/requirements.txt b/requirements.txt
index 2105aede063c65752ef4a9262eb960f749778a8a..8016ee7a86fbb4646d534bd66182f563b2cc9a44 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-git+https://github.com/inducer/pytools.git#egg=pytools
+git+https://github.com/inducer/pytools.git#egg=pytools >= 2021.1
 git+https://github.com/inducer/islpy.git#egg=islpy
 git+https://github.com/inducer/cgen.git#egg=cgen
 git+https://github.com/inducer/pyopencl.git#egg=pyopencl
@@ -6,7 +6,7 @@ git+https://github.com/inducer/pymbolic.git#egg=pymbolic
 git+https://github.com/inducer/genpy.git#egg=genpy
 git+https://github.com/inducer/codepy.git#egg=codepy
 
-git+https://github.com/inducer/f2py
+git+https://github.com/inducer/f2py#egg=f2py
 
 # Optional, needed for using the C preprocessor on Fortran
 ply>=3.6
diff --git a/setup.py b/setup.py
index ddc47fefca853321d383bad4aeaa6f24f6d5c901..fcf284bc8574dc118e4b319c1b9ff38b0b24685d 100644
--- a/setup.py
+++ b/setup.py
@@ -84,7 +84,7 @@ setup(name="loopy",
 
       python_requires="~=3.6",
       install_requires=[
-          "pytools>=2020.4",
+          "pytools>=2021.1",
           "pymbolic>=2019.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_linearization_checker.py b/test/test_linearization_checker.py
index 3c927a9cea09df50c4d0fe70dee7435b6ce3c129..56882416b8c361e09074b41d7af5b96cdcb90d2f 100644
--- a/test/test_linearization_checker.py
+++ b/test/test_linearization_checker.py
@@ -43,7 +43,9 @@ from loopy.schedule.checker.schedule import (
 logger = logging.getLogger(__name__)
 
 
-def test_lexschedule_creation():
+# {{{ test pairwise schedule creation
+
+def test_pairwise_schedule_creation():
     import islpy as isl
     from loopy.schedule.checker import (
         get_schedules_for_statement_pairs,
@@ -296,6 +298,272 @@ def test_lexschedule_creation():
     assert sched_map_before == sched_map_before_expected
     assert sched_map_after == sched_map_after_expected
 
+# }}}
+
+
+# {{{ test lex order map creation
+
+def test_lex_order_map_creation():
+    import islpy as isl
+    from loopy.schedule.checker.lexicographic_order_map import (
+        create_lex_order_map,
+    )
+    from loopy.schedule.checker.utils import (
+        append_marker_to_isl_map_var_names,
+    )
+
+    def _check_lex_map(expected_lex_order_map, n_dims):
+        # isl ignores the apostrophes, so explicitly add them
+        expected_lex_order_map = append_marker_to_isl_map_var_names(
+            expected_lex_order_map, isl.dim_type.in_, "'")
+
+        lex_order_map = create_lex_order_map(
+            n_dims=n_dims,
+            before_names=["%s%d'" % (LEX_VAR_PREFIX, i) for i in range(n_dims)],
+            after_names=["%s%d" % (LEX_VAR_PREFIX, i) for i in range(n_dims)],
+            )
+
+        assert lex_order_map == expected_lex_order_map
+        assert (
+            lex_order_map.get_var_names(isl.dim_type.in_) ==
+            expected_lex_order_map.get_var_names(isl.dim_type.in_))
+        assert (
+            lex_order_map.get_var_names(isl.dim_type.out) ==
+            expected_lex_order_map.get_var_names(isl.dim_type.out))
+
+    expected_lex_order_map = isl.Map(
+        "{{ "
+        "[{0}0', {0}1', {0}2', {0}3', {0}4'] -> [{0}0, {0}1, {0}2, {0}3, {0}4] :"
+        "("
+        "{0}0' < {0}0 "
+        ") or ("
+        "{0}0'={0}0 and {0}1' < {0}1 "
+        ") or ("
+        "{0}0'={0}0 and {0}1'={0}1 and {0}2' < {0}2 "
+        ") or ("
+        "{0}0'={0}0 and {0}1'={0}1 and {0}2'={0}2 and {0}3' < {0}3 "
+        ") or ("
+        "{0}0'={0}0 and {0}1'={0}1 and {0}2'={0}2 and {0}3'={0}3 and {0}4' < {0}4"
+        ")"
+        "}}".format(LEX_VAR_PREFIX))
+
+    _check_lex_map(expected_lex_order_map, 5)
+
+    expected_lex_order_map = isl.Map(
+        "{{ "
+        "[{0}0'] -> [{0}0] :"
+        "("
+        "{0}0' < {0}0 "
+        ")"
+        "}}".format(LEX_VAR_PREFIX))
+
+    _check_lex_map(expected_lex_order_map, 1)
+
+# }}}
+
+
+# {{{ test statement instance ordering creation
+
+def test_statement_instance_ordering_creation():
+    import islpy as isl
+    from loopy.schedule.checker import (
+        get_schedules_for_statement_pairs,
+    )
+    from loopy.schedule.checker.schedule import (
+        get_lex_order_map_for_sched_space,
+    )
+    from loopy.schedule.checker.utils import (
+        ensure_dim_names_match_and_align,
+        append_marker_to_isl_map_var_names,
+    )
+    from loopy.schedule.checker.lexicographic_order_map import (
+        get_statement_ordering_map,
+        create_lex_order_map,
+    )
+
+    # example kernel (add deps to fix loop order)
+    knl = lp.make_kernel(
+        [
+            "{[i]: 0<=i<pi}",
+            "{[k]: 0<=k<pk}",
+            "{[j]: 0<=j<pj}",
+            "{[t]: 0<=t<pt}",
+        ],
+        """
+        for i
+            for k
+                <>temp = b[i,k]  {id=insn_a}
+            end
+            for j
+                a[i,j] = temp + 1  {id=insn_b,dep=insn_a}
+                c[i,j] = d[i,j]  {id=insn_c,dep=insn_b}
+            end
+        end
+        for t
+            e[t] = f[t]  {id=insn_d, dep=insn_c}
+        end
+        """,
+        name="example",
+        assumptions="pi,pj,pk,pt >= 1",
+        lang_version=(2018, 2)
+        )
+    knl = lp.add_and_infer_dtypes(
+            knl,
+            {"b": np.float32, "d": np.float32, "f": np.float32})
+    knl = lp.prioritize_loops(knl, "i,k")
+    knl = lp.prioritize_loops(knl, "i,j")
+
+    # get a linearization
+    knl = preprocess_kernel(knl)
+    knl = get_one_linearized_kernel(knl)
+    linearization_items = knl.linearization
+
+    # Get pairwise schedules
+    insn_id_pairs = [
+        ("insn_a", "insn_b"),
+        ("insn_a", "insn_c"),
+        ("insn_a", "insn_d"),
+        ("insn_b", "insn_c"),
+        ("insn_b", "insn_d"),
+        ("insn_c", "insn_d"),
+        ]
+    sched_maps = get_schedules_for_statement_pairs(
+        knl,
+        linearization_items,
+        insn_id_pairs,
+        )
+
+    def check_sio_for_insn_pair(
+            insn_id_before,
+            insn_id_after,
+            expected_lex_dims,
+            expected_sio,
+            ):
+
+        # Get pairwise schedule
+        sched_map_before, sched_map_after = sched_maps[
+            (insn_id_before, insn_id_after)]
+
+        # Get map representing lexicographic ordering
+        sched_lex_order_map = get_lex_order_map_for_sched_space(sched_map_before)
+
+        # Get expected lex order map
+        expected_lex_order_map = create_lex_order_map(
+            n_dims=expected_lex_dims,
+            before_names=["%s%d'" % (LEX_VAR_PREFIX, i)
+                for i in range(expected_lex_dims)],
+            after_names=["%s%d" % (LEX_VAR_PREFIX, i)
+                for i in range(expected_lex_dims)],
+            )
+
+        assert sched_lex_order_map == expected_lex_order_map
+
+        # create statement instance ordering,
+        # maps each statement instance to all statement instances occurring later
+        sio = get_statement_ordering_map(
+            sched_map_before,
+            sched_map_after,
+            sched_lex_order_map,
+            )
+
+        sio_aligned = ensure_dim_names_match_and_align(sio, expected_sio)
+
+        assert sio_aligned == expected_sio
+
+    # Relationship between insn_a and insn_b ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pi, pj, pk] -> {{ "
+        "[{0}'=0, i', k'] -> [{0}=1, i, j] : "
+        "0 <= i' < pi and 0 <= k' < pk and 0 <= j < pj and 0 <= i < pi and i > i'; "
+        "[{0}'=0, i', k'] -> [{0}=1, i=i', j] : "
+        "0 <= i' < pi and 0 <= k' < pk and 0 <= j < pj "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_a", "insn_b", 2, expected_sio)
+
+    # Relationship between insn_a and insn_c ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pi, pj, pk] -> {{ "
+        "[{0}'=0, i', k'] -> [{0}=1, i, j] : "
+        "0 <= i' < pi and 0 <= k' < pk and 0 <= j < pj and 0 <= i < pi and i > i'; "
+        "[{0}'=0, i', k'] -> [{0}=1, i=i', j] : "
+        "0 <= i' < pi and 0 <= k' < pk and 0 <= j < pj "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_a", "insn_c", 2, expected_sio)
+
+    # Relationship between insn_a and insn_d ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pt, pi, pk] -> {{ "
+        "[{0}'=0, i', k'] -> [{0}=1, t] : "
+        "0 <= i' < pi and 0 <= k' < pk and 0 <= t < pt "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_a", "insn_d", 1, expected_sio)
+
+    # Relationship between insn_b and insn_c ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pi, pj] -> {{ "
+        "[{0}'=0, i', j'] -> [{0}=1, i, j] : "
+        "0 <= i' < pi and 0 <= j' < pj and i > i' and 0 <= i < pi and 0 <= j < pj; "
+        "[{0}'=0, i', j'] -> [{0}=1, i=i', j] : "
+        "0 <= i' < pi and 0 <= j' < pj and j > j' and 0 <= j < pj; "
+        "[{0}'=0, i', j'] -> [{0}=1, i=i', j=j'] : "
+        "0 <= i' < pi and 0 <= j' < pj "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_b", "insn_c", 3, expected_sio)
+
+    # Relationship between insn_b and insn_d ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pt, pi, pj] -> {{ "
+        "[{0}'=0, i', j'] -> [{0}=1, t] : "
+        "0 <= i' < pi and 0 <= j' < pj and 0 <= t < pt "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_b", "insn_d", 1, expected_sio)
+
+    # Relationship between insn_c and insn_d ---------------------------------------
+
+    expected_sio = isl.Map(
+        "[pt, pi, pj] -> {{ "
+        "[{0}'=0, i', j'] -> [{0}=1, t] : "
+        "0 <= i' < pi and 0 <= j' < pj and 0 <= t < pt "
+        "}}".format(STATEMENT_VAR_NAME)
+        )
+    # isl ignores these apostrophes, so explicitly add them
+    expected_sio = append_marker_to_isl_map_var_names(
+        expected_sio, isl.dim_type.in_, "'")
+
+    check_sio_for_insn_pair("insn_c", "insn_d", 1, expected_sio)
+
+# }}}
+
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 41b5315e890bbd8199a2a3b67fe4cf8b0ae48f8d..be595aaa5d837abcf9ff189c415e73f7393b78df 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2920,6 +2920,123 @@ def test_access_check_with_conditionals():
         lp.generate_code_v2(legal_but_nonaffine_condition_knl)
 
 
+def test_access_check_with_insn_predicates():
+    knl = lp.make_kernel(
+            "{[i]: 0<i<10}",
+            """
+            if i < 4
+              y[i] = 2*x[i]
+            end
+            """, [lp.GlobalArg("x", dtype=float, shape=(4,)), ...])
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
+def test_conditional_access_range_with_parameters(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            ["{[i]: 0 <= i < 10}",
+             "{[j]: 0 <= j < problem_size+2}"],
+            """
+            if i < 8 and j < problem_size
+                tmp[j, i] = tmp[j, i] + 1
+            end
+           """,
+            [lp.GlobalArg("tmp", shape=("problem_size", 8,), dtype=np.int64),
+             lp.ValueArg("problem_size", dtype=np.int64)])
+
+    assert np.array_equal(knl(queue, tmp=np.arange(80).reshape((10, 8)),
+                              problem_size=10)[1][0], np.arange(1, 81).reshape(
+                                (10, 8)))
+
+    # test a conditional that's only _half_ data-dependent to ensure the other
+    # half works
+    knl = lp.make_kernel(
+            ["{[i]: 0 <= i < 10}",
+             "{[j]: 0 <= j < problem_size}"],
+            """
+            if i < 8 and (j + offset) < problem_size
+                tmp[j, i] = tmp[j, i] + 1
+            end
+           """,
+            [lp.GlobalArg("tmp", shape=("problem_size", 8,), dtype=np.int64),
+             lp.ValueArg("problem_size", dtype=np.int64),
+             lp.ValueArg("offset", dtype=np.int64)])
+
+    assert np.array_equal(knl(queue, tmp=np.arange(80).reshape((10, 8)),
+                              problem_size=10,
+                              offset=0)[1][0], np.arange(1, 81).reshape(
+                                (10, 8)))
+
+
+def test_split_iname_within(ctx_factory):
+    # https://github.com/inducer/loopy/issues/163
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{ [i, j]: 0<=i<n and 0<=j<n }",
+        """
+        x[i, j] = 3 {id=a}
+        y[i, j] = 2 * y[i, j] {id=b}
+        """,
+        options=dict(write_code=True))
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "j", 4,
+                         outer_tag="g.0", inner_tag="l.0",
+                         within="id:a")
+    knl = lp.split_iname(knl, "i", 4,
+                         outer_tag="g.0", inner_tag="l.0",
+                         within="id:b")
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
+@pytest.mark.parametrize("base_type,exp_type", [(np.int32, np.uint32), (np.int64,
+    np.uint64), (np.int, np.float), (np.float, np.int), (np.int, np.int),
+    (np.float32, np.float64), (np.float64, np.float32)])
+def test_pow(ctx_factory, base_type, exp_type):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    def _make_random_np_array(shape, dtype):
+        from numpy.random import default_rng
+        rng = default_rng(0)
+        if isinstance(shape, int):
+            shape = (shape,)
+
+        dtype = np.dtype(dtype)
+        if dtype.kind in ["u", "i"]:
+            low = 0  # numpy may raise an error for negative int exponents
+            high = 6  # choosing numbers to avoid overflow (undefined behavior)
+            return rng.integers(low=low, high=high, size=shape, dtype=dtype)
+        elif dtype.kind == "f":
+            return rng.random(*shape).astype(dtype)
+        else:
+            raise NotImplementedError()
+
+    base = _make_random_np_array(10, base_type)
+    power = _make_random_np_array(10, exp_type)
+    expected_result = base ** power
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            res[i] = base[i] ** power[i]
+            """)
+
+    knl = lp.add_dtypes(knl, {"base": base_type, "power": exp_type})
+
+    evt, (result,) = knl(queue, base=base, power=power)
+
+    assert result.dtype == expected_result.dtype
+
+    np.testing.assert_allclose(expected_result, result)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 757f59e865b350c8f452977d36f2639393923fad..bcdc542cb8c4eac50143b07ef09f1460f5abd9c5 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -1070,7 +1070,7 @@ def test_floor_div_coefficient_collector():
         [
             "for i_outer",
             "for j_outer",
-            "<> loc[i_inner,j_inner] = 3.14  {id=loc_init}",
+            "<> loc[i_inner,j_inner] = 3.14f  {id=loc_init}",
             "loc[i_inner,(j_inner+r+4) %% %d] = loc[i_inner,(j_inner+r) %% %d]"
             "  {id=add,dep=loc_init}" % (bsize, bsize),
             "out0[i_outer*16+i_inner,j_outer*16+j_inner] = loc[i_inner,j_inner]"
diff --git a/test/test_transform.py b/test/test_transform.py
index 546f86838929a70f42044c3894ad972ff9f354b9..daa659808d1e7aa12f51d7b4b897672aa3344874 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -670,6 +670,49 @@ def test_add_inames_for_unused_hw_axes(ctx_factory):
             parameters={"n": n})
 
 
+def test_rename_argument_of_domain_params(ctx_factory):
+    knl = lp.make_kernel(
+            "{[i, j]: 0<=i<n and 0<=j<m}",
+            """
+            y[i, j] = 2.0f
+            """)
+
+    knl = lp.rename_argument(knl, "n", "N")
+    knl = lp.rename_argument(knl, "m", "M")
+
+    # renamed variables should not appear in the code
+    code_str = lp.generate_code_v2(knl).device_code()
+    assert code_str.find("int const n") == -1
+    assert code_str.find("int const m") == -1
+    assert code_str.find("int const N") != -1
+    assert code_str.find("int const M") != -1
+
+    lp.auto_test_vs_ref(knl, ctx_factory(), knl, parameters={"M": 10, "N": 4})
+
+
+def test_rename_argument_with_auto_stride(ctx_factory):
+    from loopy.kernel.array import FixedStrideArrayDimTag
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<10}",
+            """
+            y[i] = x[i]
+            """, [lp.GlobalArg("x", dtype=float,
+                               shape=lp.auto,
+                               dim_tags=[FixedStrideArrayDimTag(lp.auto)]), ...])
+
+    knl = lp.rename_argument(knl, "x", "x_new")
+
+    code_str = lp.generate_code_v2(knl).device_code()
+    assert code_str.find("double const *__restrict__ x_new,") != -1
+    assert code_str.find("double const *__restrict__ x,") == -1
+
+    evt, (out, ) = knl(queue, x_new=np.random.rand(10))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])