diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e71ea2c6d053188c0e2211fdf7868c6a75cc9af0..fb90b51291951d952bf30c24c3fa7c08030a53d5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,6 +10,18 @@ Python 2.7 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+Python 2.6 AMD CPU:
+  script:
+  - export PY_EXE=python2.6
+  - export PYOPENCL_TEST=amd:pu
+  - export EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python2.6
+  - amd-cl-cpu
+  except:
+  - tags
 Python 3.4 AMD CPU:
   script:
   - export PY_EXE=python3.4
@@ -35,6 +47,19 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+Python 2.7 with legacy PyOpenCL:
+  script:
+  - export PY_EXE=python2.7
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="numpy mako"
+  - export REQUIREMENTS_TXT="requirements-old-pyopencl.txt"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python2.7
+  - pocl
+  except:
+  - tags
 # PyPy AMD CPU:
 #   script:
 #   - export PY_EXE=pypy
diff --git a/bin/loopy b/bin/loopy
index cb223c31a1eef3244bd21310722f4b84a539d77c..c8cbea1c615c396d3903df24472822cc918d3461 100644
--- a/bin/loopy
+++ b/bin/loopy
@@ -50,9 +50,10 @@ def main():
 
     parser = ArgumentParser(description="Stand-alone loopy frontend")
 
-    parser.add_argument("infile")
-    parser.add_argument("outfile")
-    parser.add_argument("--lang", default="loopy")
+    parser.add_argument("infile", metavar="INPUT_FILE")
+    parser.add_argument("outfile", default="-", metavar="OUTPUT_FILE",
+            help="Defaults to stdout ('-').", nargs='?')
+    parser.add_argument("--lang", metavar="LANGUAGE", help="loopy|fortran")
     parser.add_argument("--target")
     parser.add_argument("--name")
     parser.add_argument("--transform")
@@ -65,13 +66,30 @@ def main():
         from warnings import warn
         warn("--target option is deprecated and ignored")
 
+    lang = None
     if args.infile == "-":
         infile_content = sys.stdin.read()
     else:
+        from os.path import splitext
+        _, ext = splitext(args.infile)
+
+        lang = {
+                ".loopy": "loopy",
+                ".floopy": "fortran",
+                ".f90": "fortran",
+                ".fpp": "fortran",
+                }.get(ext)
         with open(args.infile, "r") as infile_fd:
             infile_content = infile_fd.read()
 
-    if args.lang == "loopy":
+    if args.lang is not None:
+        lang = args.lang
+
+    if lang is None:
+        raise RuntimeError("unable to deduce input language "
+                "(wrong input file extension? --lang flag?)")
+
+    if lang == "loopy":
         # {{{ path wrangling
 
         from os.path import dirname, abspath
@@ -115,7 +133,7 @@ def main():
 
         kernels = [kernel]
 
-    elif args.lang in ["fortran", "floopy", "fpp"]:
+    elif lang in ["fortran", "floopy", "fpp"]:
         pre_transform_code = None
         if args.transform:
             with open(args.transform, "r") as xform_fd:
@@ -168,10 +186,15 @@ def main():
         code, impl_arg_info = generate_code(kernel)
         codes.append(code)
 
-    if args.outfile == "-":
+    if args.outfile:
+        outfile = args.outfile
+    else:
+        outfile = "-"
+
+    if outfile == "-":
         sys.stdout.write("\n\n".join(codes))
     else:
-        with open(args.outfile, "w") as outfile_fd:
+        with open(outfile, "w") as outfile_fd:
             outfile_fd.write("\n\n".join(codes))
 
 
diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec
index 7650624932eb4b6aa95e888410e8a0aa1a6d518a..3fcff944671291c652621afaa9cd82080605d73c 100644
--- a/build-helpers/loopy.spec
+++ b/build-helpers/loopy.spec
@@ -1,7 +1,13 @@
 # -*- mode: python -*-
 
+from os.path import basename, dirname, join
+from glob import glob
+
 single_file = True
 
+# This makes the executable spew debug info.
+debug = False
+
 from os.path import expanduser
 
 a = Analysis(['bin/loopy'],
@@ -11,6 +17,19 @@ a = Analysis(['bin/loopy'],
              runtime_hooks=None,
              excludes=["hedge", "meshpy", "pyopencl", "PIL"]
              )
+
+import ply.lex
+import ply.yacc
+
+
+a.datas += [
+  (join("py-src", "ply", "lex", basename(fn)), fn, "DATA")
+  for fn in glob(join(dirname(ply.lex.__file__), "*.py"))
+  ] + [
+  (join("py-src", "ply", "yacc", basename(fn)), fn, "DATA")
+  for fn in glob(join(dirname(ply.yacc.__file__), "*.py"))
+  ]
+
 pyz = PYZ(a.pure)
 
 if single_file:
@@ -20,7 +39,7 @@ if single_file:
               a.zipfiles,
               a.datas,
               name='loopy',
-              debug=False,
+              debug=debug,
               strip=None,
               upx=True,
               console=True)
@@ -29,7 +48,7 @@ else:
               a.scripts,
               exclude_binaries=True,
               name='loopy',
-              debug=False,
+              debug=debug,
               strip=None,
               upx=True,
               console=True)
diff --git a/build-helpers/make-linux-build-docker.sh b/build-helpers/make-linux-build-docker.sh
index 2deb2935e429a7d4281d3c09e884fb5df92125fc..90684a267e98f1976a31fd3036b824d2c2cdc1d8 100755
--- a/build-helpers/make-linux-build-docker.sh
+++ b/build-helpers/make-linux-build-docker.sh
@@ -1,5 +1,7 @@
 #! /bin/bash
 
+# should be run in this directory (build-helpers)
+
 set -e
 set -x
 
diff --git a/doc/reference.rst b/doc/reference.rst
index 6a42ed944249aeeeff07677ceb9fc46fd0c45cb4..e79f17554119c5efe9351f46b8c501c2c27d2387 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -408,7 +408,7 @@ Dealing with Substitution Rules
 
 .. autofunction:: extract_subst
 
-.. autofunction:: temporary_to_subst
+.. autofunction:: assignment_to_subst
 
 .. autofunction:: expand_subst
 
@@ -421,6 +421,8 @@ Caching, Precomputation and Prefetching
 
 .. autofunction:: buffer_array
 
+.. autofunction:: alias_temporaries
+
 Influencing data access
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -470,6 +472,11 @@ Arguments
 
 .. autofunction:: add_and_infer_dtypes
 
+Batching
+^^^^^^^^
+
+.. autofunction:: to_batched
+
 Finishing up
 ^^^^^^^^^^^^
 
@@ -528,7 +535,7 @@ Obtaining Kernel Statistics
 
 .. autofunction:: get_op_poly
 
-.. autofunction:: get_DRAM_access_poly
+.. autofunction:: get_gmem_access_poly
 
 .. autofunction:: get_barrier_poly
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index ef6867cd8d5729cb490a3fba8ad55e3ccc1ce9ea..c1ce360c88b962ef438809500413d747a3cd4b43 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1256,14 +1256,14 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 Counting array accesses
 ~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_DRAM_access_poly` provides information on the number and type of
+:func:`loopy.get_gmem_access_poly` provides information on the number and type of
 array loads and stores being performed in a kernel. To demonstrate this, we'll
 continue using the kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_DRAM_access_poly
-    >>> load_store_map = get_DRAM_access_poly(knl)
+    >>> from loopy.statistics import get_gmem_access_poly
+    >>> load_store_map = get_gmem_access_poly(knl)
     >>> print(load_store_map)
     (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
     (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
@@ -1271,7 +1271,7 @@ continue using the kernel from the previous example:
     (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
     <BLANKLINE>
 
-:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(**
+:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
 :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
 **:** :class:`islpy.PwQPolynomial` **}**.
 
@@ -1313,7 +1313,7 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 ~~~~~~~~~~~
 
 Since we have not tagged any of the inames or parallelized the kernel across threads
-(which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers
+(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
 the array accesses *uniform*. Now we'll parallelize the kernel and count the array
 accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
 this time, so we'll print the mapping manually to make it more legible:
@@ -1321,7 +1321,7 @@ this time, so we'll print the mapping manually to make it more legible:
 .. doctest::
 
     >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
-    >>> load_store_map = get_DRAM_access_poly(knl_consec)
+    >>> load_store_map = get_gmem_access_poly(knl_consec)
     >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)):
     ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
     (dtype('float32'), 'consecutive', 'load') :
@@ -1368,7 +1368,7 @@ our parallelization of the kernel:
 .. doctest::
 
     >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
-    >>> load_store_map = get_DRAM_access_poly(knl_nonconsec)
+    >>> load_store_map = get_gmem_access_poly(knl_nonconsec)
     >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)):
     ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
     (dtype('float32'), 'nonconsecutive', 'load') :
diff --git a/examples/fortran/run-floopy.sh b/examples/fortran/run-floopy.sh
deleted file mode 100755
index fcea2c8b5ed58eed8738ad263df62cdf687b3d0f..0000000000000000000000000000000000000000
--- a/examples/fortran/run-floopy.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#! /bin/sh
-
-NAME="$1"
-shift
-
-python $(which loopy) --lang=fpp "$NAME" - "$@"
diff --git a/examples/fortran/run-loopy.sh b/examples/fortran/run-loopy.sh
deleted file mode 100755
index f22f78424bc654e352fff2806120701d096d7068..0000000000000000000000000000000000000000
--- a/examples/fortran/run-loopy.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#! /bin/sh
-
-python $(which loopy) --lang=loopy "$NAME" - "$@"
diff --git a/loopy/__init__.py b/loopy/__init__.py
index a161e54783b5246cf6a9d7c587cc0ffc7da19425..8956856d4735a9554ed8c34741790ef1286d9e54 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -54,7 +54,7 @@ from loopy.kernel.tools import (
         add_and_infer_dtypes)
 from loopy.kernel.creation import make_kernel, UniqueName
 from loopy.library.reduction import register_reduction_parser
-from loopy.subst import extract_subst, expand_subst, temporary_to_subst
+from loopy.subst import extract_subst, expand_subst, assignment_to_subst
 from loopy.precompute import precompute
 from loopy.buffer import buffer_array
 from loopy.fusion import fuse_kernels
@@ -63,7 +63,8 @@ from loopy.padding import (split_arg_axis, find_padding_multiple,
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly
+from loopy.statistics import (get_op_poly, get_gmem_access_poly,
+        get_DRAM_access_poly, get_barrier_poly)
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -89,7 +90,7 @@ __all__ = [
 
         "register_reduction_parser",
 
-        "extract_subst", "expand_subst", "temporary_to_subst",
+        "extract_subst", "expand_subst", "assignment_to_subst",
         "precompute", "buffer_array",
         "fuse_kernels",
         "split_arg_axis", "find_padding_multiple", "add_padding",
@@ -103,7 +104,8 @@ __all__ = [
         "generate_loop_schedules", "get_one_scheduled_kernel",
         "generate_code", "generate_body",
 
-        "get_op_poly", "get_DRAM_access_poly", "get_barrier_poly",
+        "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly",
+        "get_barrier_poly",
 
         "CompiledKernel",
 
@@ -660,23 +662,93 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
 # }}}
 
 
-def rename_iname(knl, old_iname, new_iname, within):
+# {{{ rename_inames
+
+def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
     """
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
+    :arg existing_ok: execute even if *new_iname* already exists
     """
 
     var_name_gen = knl.get_var_name_generator()
 
-    if var_name_gen.is_name_conflicting(new_iname):
+    does_exist = var_name_gen.is_name_conflicting(new_iname)
+
+    if does_exist and not existing_ok:
         raise ValueError("iname '%s' conflicts with an existing identifier"
                 "--cannot rename" % new_iname)
 
-    knl = duplicate_inames(knl, [old_iname], within=within, new_inames=[new_iname])
+    if does_exist:
+        # {{{ check that the domains match up
+
+        dom = knl.get_inames_domain(frozenset((old_iname, new_iname)))
+
+        var_dict = dom.get_var_dict()
+        _, old_idx = var_dict[old_iname]
+        _, new_idx = var_dict[new_iname]
+
+        par_idx = dom.dim(dim_type.param)
+        dom_old = dom.move_dims(
+                dim_type.param, par_idx, dim_type.set, old_idx, 1)
+        dom_old = dom_old.move_dims(
+                dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1)
+        dom_old = dom_old.project_out(
+                dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1)
+
+        par_idx = dom.dim(dim_type.param)
+        dom_new = dom.move_dims(
+                dim_type.param, par_idx, dim_type.set, new_idx, 1)
+        dom_new = dom_new.move_dims(
+                dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1)
+        dom_new = dom_new.project_out(
+                dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1)
+
+        if not (dom_old <= dom_new and dom_new <= dom_old):
+            raise LoopyError(
+                    "inames {old} and {new} do not iterate over the same domain"
+                    .format(old=old_iname, new=new_iname))
+
+        # }}}
+
+        from pymbolic import var
+        subst_dict = {old_iname: var(new_iname)}
+
+        from loopy.context_matching import parse_stack_match
+        within = parse_stack_match(within)
+
+        from pymbolic.mapper.substitutor import make_subst_func
+        rule_mapping_context = SubstitutionRuleMappingContext(
+                knl.substitutions, var_name_gen)
+        ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
+                        make_subst_func(subst_dict), within)
+
+        knl = rule_mapping_context.finish_kernel(
+                ijoin.map_kernel(knl))
+
+        new_instructions = []
+        for insn in knl.instructions:
+            if (old_iname in insn.forced_iname_deps
+                    and within(knl, insn, ())):
+                insn = insn.copy(
+                        forced_iname_deps=(
+                            (insn.forced_iname_deps - frozenset([old_iname]))
+                            | frozenset([new_iname])))
+
+            new_instructions.append(insn)
+
+        knl = knl.copy(instructions=new_instructions)
+
+    else:
+        knl = duplicate_inames(
+                knl, [old_iname], within=within, new_inames=[new_iname])
+
     knl = remove_unused_inames(knl, [old_iname])
 
     return knl
 
+# }}}
+
 
 # {{{ link inames
 
@@ -1845,4 +1917,176 @@ def tag_instructions(kernel, new_tag, within=None):
 
 # }}}
 
+
+# {{{ alias_temporaries
+
+def alias_temporaries(knl, names, base_name_prefix=None):
+    """Sets all temporaries given by *names* to be backed by a single piece of
+    storage. Also introduces ordering structures ("groups") to prevent the
+    usage of each temporary to interfere with another.
+
+    :arg base_name_prefix: an identifier to be used for the common storage
+        area
+    """
+    gng = knl.get_group_name_generator()
+    group_names = [gng("tmpgrp_"+name) for name in names]
+
+    if base_name_prefix is None:
+        base_name_prefix = "temp_storage"
+
+    vng = knl.get_var_name_generator()
+    base_name = vng(base_name_prefix)
+
+    names_set = set(names)
+
+    new_insns = []
+    for insn in knl.instructions:
+        temp_deps = insn.dependency_names() & names_set
+
+        if not temp_deps:
+            new_insns.append(insn)
+            continue
+
+        if len(temp_deps) > 1:
+            raise LoopyError("Instruction {insn} refers to multiple of the "
+                    "temporaries being aliased, namely '{temps}'. Cannot alias."
+                    .format(
+                        insn=insn.id,
+                        temps=", ".join(temp_deps)))
+
+        temp_name, = temp_deps
+        temp_idx = names.index(temp_name)
+        group_name = group_names[temp_idx]
+        other_group_names = (
+                frozenset(group_names[:temp_idx])
+                | frozenset(group_names[temp_idx+1:]))
+
+        new_insns.append(
+                insn.copy(
+                    groups=insn.groups | frozenset([group_name]),
+                    conflicts_with_groups=(
+                        insn.conflicts_with_groups | other_group_names)))
+
+    new_temporary_variables = {}
+    for tv in six.itervalues(knl.temporary_variables):
+        if tv.name in names_set:
+            if tv.base_storage is not None:
+                raise LoopyError("temporary variable '{tv}' already has "
+                        "a defined storage array -- cannot alias"
+                        .format(tv=tv.name))
+
+            new_temporary_variables[tv.name] = \
+                    tv.copy(base_storage=base_name)
+        else:
+            new_temporary_variables[tv.name] = tv
+
+    return knl.copy(
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables)
+
+# }}}
+
+
+# {{{ to_batched
+
+class _BatchVariableChanger(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, kernel, batch_varying_args,
+            batch_iname_expr):
+        super(_BatchVariableChanger, self).__init__(rule_mapping_context)
+
+        self.kernel = kernel
+        self.batch_varying_args = batch_varying_args
+        self.batch_iname_expr = batch_iname_expr
+
+    def needs_batch_subscript(self, name):
+        return (
+                name in self.kernel.temporary_variables
+                or
+                name in self.batch_varying_args)
+
+    def map_subscript(self, expr, expn_state):
+        if not self.needs_batch_subscript(expr.aggregate.name):
+            return super(_BatchVariableChanger, self).map_subscript(expr, expn_state)
+
+        idx = expr.index
+        if not isinstance(idx, tuple):
+            idx = (idx,)
+
+        return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx)
+
+    def map_variable(self, expr, expn_state):
+        if not self.needs_batch_subscript(expr.name):
+            return super(_BatchVariableChanger, self).map_variable(expr, expn_state)
+
+        return expr[self.batch_iname_expr]
+
+
+def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"):
+    """Takes in a kernel that carries out an operation and returns a kernel
+    that carries out a batch of these operations.
+
+    :arg nbatches: the number of batches. May be a constant non-negative
+        integer or a string, which will be added as an integer argument.
+    :arg batch_varying_args: a list of argument names that vary per batch.
+        Each such variable will have a batch index added.
+    """
+
+    from pymbolic import var
+
+    vng = knl.get_var_name_generator()
+    batch_iname = vng(batch_iname_prefix)
+    batch_iname_expr = var(batch_iname)
+
+    new_args = []
+
+    batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % {
+            "iname": batch_iname,
+            "nbatches": nbatches,
+            }
+
+    if not isinstance(nbatches, int):
+        batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
+        new_args.append(ValueArg(nbatches, dtype=knl.index_dtype))
+
+        nbatches_expr = var(nbatches)
+    else:
+        nbatches_expr = nbatches
+
+    batch_domain = isl.BasicSet(batch_dom_str)
+    new_domains = [batch_domain] + knl.domains
+
+    for arg in knl.args:
+        if arg.name in batch_varying_args:
+            if isinstance(arg, ValueArg):
+                arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,),
+                        dim_tags="c")
+            else:
+                arg = arg.copy(
+                        shape=(nbatches_expr,) + arg.shape,
+                        dim_tags=("c",) * (len(arg.shape) + 1))
+
+        new_args.append(arg)
+
+    new_temps = {}
+
+    for temp in six.itervalues(knl.temporary_variables):
+        new_temps[temp.name] = temp.copy(
+                shape=(nbatches_expr,) + temp.shape,
+                dim_tags=("c",) * (len(temp.shape) + 1))
+
+    knl = knl.copy(
+            domains=new_domains,
+            args=new_args,
+            temporary_variables=new_temps)
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            knl.substitutions, vng)
+    bvc = _BatchVariableChanger(rule_mapping_context,
+            knl, batch_varying_args, batch_iname_expr)
+    return rule_mapping_context.finish_kernel(
+            bvc.map_kernel(knl))
+
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index c264428279795d2d30ab488031758d7bb7468226..e5f88417df6ca555d6475c21257534f8e995e812 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -353,14 +353,16 @@ def auto_test_vs_ref(
         ref_knl, ctx, test_knl, op_count=[], op_label=[], parameters={},
         print_ref_code=False, print_code=True, warmup_rounds=2,
         dump_binary=False,
-        fills_entire_output=None, do_check=True, check_result=None
-        ):
+        fills_entire_output=None, do_check=True, check_result=None,
+        max_test_kernel_count=1,
+        quiet=False):
     """Compare results of `ref_knl` to the kernels generated by
     scheduling *test_knl*.
 
     :arg check_result: a callable with :class:`numpy.ndarray` arguments
         *(result, reference_result)* returning a a tuple (class:`bool`,
         message) indicating correctness/acceptability of the result
+    :arg max_test_kernel_count: Stop testing after this many test kernels.
     """
 
     import pyopencl as cl
@@ -416,7 +418,7 @@ def auto_test_vs_ref(
             break
 
         ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
-        if print_ref_code:
+        if not quiet and print_ref_code:
             print(75*"-")
             print("Reference Code:")
             print(75*"-")
@@ -469,7 +471,7 @@ def auto_test_vs_ref(
         logger.info("%s (ref): run done" % ref_knl.name)
 
         ref_evt.wait()
-        ref_elapsed = 1e-9*(ref_evt.profile.END-ref_evt.profile.SUBMIT)
+        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)
 
         break
 
@@ -488,28 +490,25 @@ def auto_test_vs_ref(
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     args = None
-    from loopy.kernel import LoopKernel
-    if not isinstance(test_knl, LoopKernel):
-        warn("Passing an iterable of kernels to auto_test_vs_ref "
-                "is deprecated--just pass the kernel instead. "
-                "Scheduling will be performed in auto_test_vs_ref.",
-                DeprecationWarning, stacklevel=2)
-
-        test_kernels = test_knl
+    from loopy.kernel import kernel_state
+    if test_knl.state not in [
+            kernel_state.PREPROCESSED,
+            kernel_state.SCHEDULED]:
+        test_knl = lp.preprocess_kernel(test_knl)
+
+    if not test_knl.schedule:
+        test_kernels = lp.generate_loop_schedules(test_knl)
     else:
-        from loopy.kernel import kernel_state
-        if test_knl.state not in [
-                kernel_state.PREPROCESSED,
-                kernel_state.SCHEDULED]:
-            test_knl = lp.preprocess_kernel(test_knl)
-
-        if not test_knl.schedule:
-            test_kernels = lp.generate_loop_schedules(test_knl)
-        else:
-            test_kernels = [test_knl]
+        test_kernels = [test_knl]
+
+    test_kernel_count = 0
 
     from loopy.preprocess import infer_unknown_types
     for i, kernel in enumerate(test_kernels):
+        test_kernel_count += 1
+        if test_kernel_count > max_test_kernel_count:
+            break
+
         kernel = infer_unknown_types(kernel, expect_completion=True)
 
         compiled = CompiledKernel(ctx, kernel)
@@ -521,16 +520,17 @@ def auto_test_vs_ref(
                     queue, ref_arg_data, parameters)
         args["out_host"] = False
 
-        print(75*"-")
-        print("Kernel #%d:" % i)
-        print(75*"-")
-        if print_code:
-            print(compiled.get_highlighted_code())
+        if not quiet:
             print(75*"-")
-        if dump_binary:
-            print(type(compiled.cl_program))
-            print(compiled.cl_program.binaries[0])
+            print("Kernel #%d:" % i)
             print(75*"-")
+            if print_code:
+                print(compiled.get_highlighted_code())
+                print(75*"-")
+            if dump_binary:
+                print(type(compiled.cl_program))
+                print(compiled.cl_program.binaries[0])
+                print(75*"-")
 
         logger.info("%s: run warmup" % (knl.name))
 
@@ -596,16 +596,15 @@ def auto_test_vs_ref(
             evt_start.wait()
             evt_end.wait()
 
-            elapsed = (1e-9*events[-1].profile.END
-                    - 1e-9*events[0].profile.SUBMIT) \
+            elapsed_event = (1e-9*events[-1].profile.END
+                    - 1e-9*events[0].profile.START) \
                     / timing_rounds
             try:
-                elapsed_evt_2 = "%g" % \
-                        ((1e-9*evt_end.profile.START
+                elapsed_event_marker = ((1e-9*evt_end.profile.START
                             - 1e-9*evt_start.profile.START)
                         / timing_rounds)
             except cl.RuntimeError:
-                elapsed_evt_2 = "<unavailable>"
+                elapsed_event_marker = None
 
             elapsed_wall = (stop_time-start_time)/timing_rounds
 
@@ -620,28 +619,36 @@ def auto_test_vs_ref(
         for cnt, lbl in zip(op_count, op_label):
             rates += " %g %s/s" % (cnt/elapsed_wall, lbl)
 
-        print("elapsed: %g s event, %s s marker-event %g s wall "
-                "(%d rounds)%s" % (
-                    elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates))
+        if not quiet:
+            def format_float_or_none(v):
+                if v is None:
+                    return "<unavailable>"
+                else:
+                    return "%g" % v
+
+            print("elapsed: %s s event, %s s marker-event %s s wall "
+                    "(%d rounds)%s" % (
+                        format_float_or_none(elapsed_event),
+                        format_float_or_none(elapsed_event_marker),
+                        format_float_or_none(elapsed_wall), timing_rounds, rates))
 
         if do_check:
             ref_rates = ""
             for cnt, lbl in zip(op_count, op_label):
-                ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl)
-            print("ref: elapsed: %g s event, %g s wall%s" % (
-                    ref_elapsed, ref_elapsed_wall, ref_rates))
+                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
+            if not quiet:
+                print("ref: elapsed: %g s event, %g s wall%s" % (
+                        ref_elapsed_event, ref_elapsed_wall, ref_rates))
 
     # }}}
 
     result_dict = {}
-    result_dict["elapsed"] = elapsed
-    result_dict["elapsed_evt_2"] = elapsed_evt_2
+    result_dict["elapsed_event"] = elapsed_event
+    result_dict["elapsed_event_marker"] = elapsed_event_marker
     result_dict["elapsed_wall"] = elapsed_wall
     result_dict["timing_rounds"] = timing_rounds
-    result_dict["rates"] = rates
-    result_dict["ref_elapsed"] = elapsed
-    result_dict["ref_elapsed_wall"] = elapsed_wall
-    result_dict["ref_rates"] = ref_rates
+    result_dict["ref_elapsed_event"] = ref_elapsed_event
+    result_dict["ref_elapsed_wall"] = ref_elapsed_wall
 
     return result_dict
 
diff --git a/loopy/buffer.py b/loopy/buffer.py
index fdc3774b29f64ba5ae8c465076f48b805836d40b..1e6a137b551645a25145ddaaeb8eea40eea554af 100644
--- a/loopy/buffer.py
+++ b/loopy/buffer.py
@@ -29,9 +29,15 @@ from loopy.symbolic import (get_dependencies,
         RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
         SubstitutionMapper)
 from pymbolic.mapper.substitutor import make_subst_func
+from pytools.persistent_dict import PersistentDict
+from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
+from loopy.version import DATA_MODEL_VERSION
 
 from pymbolic import var
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 # {{{ replace array access
 
@@ -117,6 +123,11 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
 # }}}
 
 
+buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
+        key_builder=LoopyKeyBuilder())
+
+
+# Adding an argument? also add something to the cache_key below.
 def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         store_expression=None, within=None, default_tag="l.auto",
         temporary_is_local=None, fetch_bounding_box=False):
@@ -173,6 +184,25 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
 
     # }}}
 
+    # {{{ caching
+
+    from loopy import CACHING_ENABLED
+
+    cache_key = (kernel, var_name, tuple(buffer_inames),
+            PymbolicExpressionHashWrapper(init_expression),
+            PymbolicExpressionHashWrapper(store_expression), within,
+            default_tag, temporary_is_local, fetch_bounding_box)
+
+    if CACHING_ENABLED:
+        try:
+            result = buffer_array_cache[cache_key]
+            logger.info("%s: buffer_array cache hit" % kernel.name)
+            return result
+        except KeyError:
+            pass
+
+    # }}}
+
     var_name_gen = kernel.get_var_name_generator()
     within_inames = set()
 
@@ -413,6 +443,10 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     from loopy import tag_inames
     kernel = tag_inames(kernel, new_iname_to_tag)
 
+    if CACHING_ENABLED:
+        from loopy.preprocess import prepare_for_caching
+        buffer_array_cache[cache_key] = prepare_for_caching(kernel)
+
     return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/compiled.py b/loopy/compiled.py
index d8d127c0bc89f439569b8a016e485054c05f2bc2..6d4396b5a11ef99bcf7e8e0b03b5d9d7d8fb6d88 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -35,6 +35,9 @@ from pytools.py_codegen import (
         Indentation, PythonFunctionGenerator)
 from loopy.diagnostic import LoopyError
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 # {{{ object array argument packing
 
@@ -716,11 +719,15 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
     if not lsize_expr:
         lsize_expr = (1,)
 
+    def strify_tuple(t):
+        return "(%s,)" % (
+                ", ".join("int(%s)" % strify(t_i) for t_i in t))
+
     gen("_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, "
             "%(gsize)s, %(lsize)s,  wait_for=wait_for, g_times_l=True)"
             % dict(
-                gsize=strify(gsize_expr),
-                lsize=strify(lsize_expr)))
+                gsize=strify_tuple(gsize_expr),
+                lsize=strify_tuple(lsize_expr)))
     gen("")
 
     # }}}
@@ -858,10 +865,13 @@ class CompiledKernel:
             code = invoke_editor(code, "code.cl")
 
         import pyopencl as cl
+
+        logger.info("%s: opencl compilation start", self.kernel.name)
         cl_program = cl.Program(self.context, code)
         cl_kernel = getattr(
                 cl_program.build(options=kernel.options.cl_build_options),
                 kernel.name)
+        logger.info("%s: opencl compilation done", self.kernel.name)
 
         return _CLKernelInfo(
                 kernel=kernel,
diff --git a/loopy/context_matching.py b/loopy/context_matching.py
index 61203ece2c38ae7beb385bd8b4758c3ce5eeeea8..a88e207002220a1be840114d71948869f566863d 100644
--- a/loopy/context_matching.py
+++ b/loopy/context_matching.py
@@ -94,11 +94,20 @@ class MatchExpressionBase(object):
     def __call__(self, kernel, matchable):
         raise NotImplementedError
 
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
 
 class AllMatchExpression(MatchExpressionBase):
     def __call__(self, kernel, matchable):
         return True
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "all_match_expr")
+
+    def __eq__(self, other):
+        return (type(self) == type(other))
+
 
 class AndMatchExpression(MatchExpressionBase):
     def __init__(self, children):
@@ -110,6 +120,14 @@ class AndMatchExpression(MatchExpressionBase):
     def __str__(self):
         return "(%s)" % (" and ".join(str(ch) for ch in self.children))
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "and_match_expr")
+        key_builder.rec(key_hash, self.children)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.children == other.children)
+
 
 class OrMatchExpression(MatchExpressionBase):
     def __init__(self, children):
@@ -121,6 +139,14 @@ class OrMatchExpression(MatchExpressionBase):
     def __str__(self):
         return "(%s)" % (" or ".join(str(ch) for ch in self.children))
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "or_match_expr")
+        key_builder.rec(key_hash, self.children)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.children == other.children)
+
 
 class NotMatchExpression(MatchExpressionBase):
     def __init__(self, child):
@@ -132,6 +158,14 @@ class NotMatchExpression(MatchExpressionBase):
     def __str__(self):
         return "(not %s)" % str(self.child)
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "not_match_expr")
+        key_builder.rec(key_hash, self.child)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.child == other.child)
+
 
 class GlobMatchExpressionBase(MatchExpressionBase):
     def __init__(self, glob):
@@ -146,6 +180,14 @@ class GlobMatchExpressionBase(MatchExpressionBase):
         descr = descr[:descr.find("Match")]
         return descr.lower() + ":" + self.glob
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, type(self).__name__)
+        key_builder.rec(key_hash, self.glob)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.glob == other.glob)
+
 
 class IdMatchExpression(GlobMatchExpressionBase):
     def __call__(self, kernel, matchable):
@@ -284,18 +326,31 @@ def parse_match(expr_str):
 # {{{ stack match objects
 
 class StackMatchComponent(object):
-    pass
+    def __ne__(self, other):
+        return not self.__eq__(other)
 
 
 class StackAllMatchComponent(StackMatchComponent):
     def __call__(self, kernel, stack):
         return True
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "all_match")
+
+    def __eq__(self, other):
+        return (type(self) == type(other))
+
 
 class StackBottomMatchComponent(StackMatchComponent):
     def __call__(self, kernel, stack):
         return not stack
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "bottom_match")
+
+    def __eq__(self, other):
+        return (type(self) == type(other))
+
 
 class StackItemMatchComponent(StackMatchComponent):
     def __init__(self, match_expr, inner_match):
@@ -312,6 +367,16 @@ class StackItemMatchComponent(StackMatchComponent):
 
         return self.inner_match(kernel, stack[1:])
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, "item_match")
+        key_builder.rec(key_hash, self.match_expr)
+        key_builder.rec(key_hash, self.inner_match)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.match_expr == other.match_expr
+                and self.inner_match == other.inner_match)
+
 
 class StackWildcardMatchComponent(StackMatchComponent):
     def __init__(self, inner_match):
@@ -348,6 +413,18 @@ class StackMatch(object):
     def __init__(self, root_component):
         self.root_component = root_component
 
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, self.root_component)
+
+    def __eq__(self, other):
+        return (
+                type(self) == type(other)
+                and
+                self.root_component == other.root_component)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
     def __call__(self, kernel, insn, rule_stack):
         """
         :arg rule_stack: a tuple of (name, tags) rule invocation, outermost first
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 97005cab607f1c1db5341ca3e29bd80ec7e761c1..8293035a9e9ae951c069136b8705673a7f4f7d93 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import re
 
 import six
+from six.moves import intern
 
 import loopy as lp
 import numpy as np
@@ -221,7 +222,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
     def add_expression_instruction(self, lhs, rhs):
         scope = self.scope_stack[-1]
 
-        new_id = "insn%d" % self.insn_id_counter
+        new_id = intern("insn%d" % self.insn_id_counter)
         self.insn_id_counter += 1
 
         if self.auto_dependencies and scope.previous_instruction_id:
@@ -447,7 +448,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
     def map_IfThen(self, node):
         scope = self.scope_stack[-1]
 
-        cond_name = "loopy_cond%d" % self.condition_id_counter
+        cond_name = intern("loopy_cond%d" % self.condition_id_counter)
         self.condition_id_counter += 1
         assert cond_name not in scope.type_map
 
@@ -543,6 +544,8 @@ class F2LoopyTranslator(FTreeWalkerBase):
             loop_var_suffix += 1
             loopy_loop_var = loop_var + "_%d" % loop_var_suffix
 
+        loopy_loop_var = intern(loopy_loop_var)
+
         # }}}
 
         space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
diff --git a/loopy/fusion.py b/loopy/fusion.py
index c14d936afb4ff063bad9e9ff7e1189daadf15a5c..8845951ea293d5a0e66d457a4bcb8680db57623c 100644
--- a/loopy/fusion.py
+++ b/loopy/fusion.py
@@ -143,8 +143,12 @@ def _fuse_two_kernels(knla, knlb):
         else:
             if b_arg != knla.arg_dict[b_arg.name]:
                 raise LoopyError(
-                        "argument '%s' has inconsistent definition between "
-                        "the two kernels being merged" % b_arg.name)
+                        "argument '{arg_name}' has inconsistent definition between "
+                        "the two kernels being merged ({arg_a} <-> {arg_b})"
+                        .format(
+                            arg_name=b_arg.name,
+                            arg_a=str(knla.arg_dict[b_arg.name]),
+                            arg_b=str(b_arg)))
 
     # }}}
 
@@ -214,9 +218,9 @@ def _fuse_two_kernels(knla, knlb):
     assump_a, assump_b = isl.align_two(assump_a, assump_b)
 
     shared_param_names = list(
-            set(dom_a.get_var_dict(dim_type.set))
+            set(assump_a.get_var_dict(dim_type.set))
             &
-            set(dom_b.get_var_dict(dim_type.set)))
+            set(assump_b.get_var_dict(dim_type.set)))
 
     assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
-    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
+    assump_b_s = assump_b.project_out_except(shared_param_names, [dim_type.param])
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 16695e0ad26d3ec9d4f4b855aea15585117ff227..4e31db993517fc4426766a6e881fb7f56853915c 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -25,7 +25,7 @@ THE SOFTWARE.
 """
 
 import six
-from six.moves import range, zip
+from six.moves import range, zip, intern
 
 import numpy as np
 from pytools import RecordWithoutPickling, Record, memoize_method
@@ -335,6 +335,9 @@ class LoopKernel(RecordWithoutPickling):
     def all_variable_names(self):
         return (
                 set(six.iterkeys(self.temporary_variables))
+                | set(tv.base_storage
+                    for tv in six.itervalues(self.temporary_variables)
+                    if tv.base_storage is not None)
                 | set(six.iterkeys(self.substitutions))
                 | set(arg.name for arg in self.args)
                 | set(self.all_inames()))
@@ -351,7 +354,18 @@ class LoopKernel(RecordWithoutPickling):
 
         for id_str in generate_unique_names(based_on):
             if id_str not in used_ids:
-                return id_str
+                return intern(id_str)
+
+    def all_group_names(self):
+        result = set()
+        for insn in self.instructions:
+            result.update(insn.groups)
+            result.update(insn.conflicts_with_groups)
+
+        return frozenset(result)
+
+    def get_group_name_generator(self):
+        return _UniqueVarNameGenerator(set(self.all_group_names()))
 
     def get_var_descriptor(self, name):
         try:
@@ -577,7 +591,8 @@ class LoopKernel(RecordWithoutPickling):
     def all_inames(self):
         result = set()
         for dom in self.domains:
-            result.update(dom.get_var_names(dim_type.set))
+            result.update(
+                    intern(n) for n in dom.get_var_names(dim_type.set))
         return frozenset(result)
 
     @memoize_method
@@ -588,7 +603,8 @@ class LoopKernel(RecordWithoutPickling):
         for dom in self.domains:
             result.update(set(dom.get_var_names(dim_type.param)) - all_inames)
 
-        return frozenset(result)
+        from loopy.tools import intern_frozenset_of_ids
+        return intern_frozenset_of_ids(result)
 
     def outer_params(self, domains=None):
         if domains is None:
@@ -600,7 +616,8 @@ class LoopKernel(RecordWithoutPickling):
             all_inames.update(dom.get_var_names(dim_type.set))
             all_params.update(dom.get_var_names(dim_type.param))
 
-        return all_params-all_inames
+        from loopy.tools import intern_frozenset_of_ids
+        return intern_frozenset_of_ids(all_params-all_inames)
 
     @memoize_method
     def all_insn_inames(self):
@@ -747,6 +764,15 @@ class LoopKernel(RecordWithoutPickling):
                 for insn in self.instructions
                 for var_name, _ in insn.assignees_and_indices())
 
+    @memoize_method
+    def get_temporary_to_base_storage_map(self):
+        result = {}
+        for tv in six.itervalues(self.temporary_variables):
+            if tv.base_storage:
+                result[tv.name] = tv.base_storage
+
+        return result
+
     # }}}
 
     # {{{ argument wrangling
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 2923c945c08c6a5fc2d19831efcdf235ea949a72..9a3c9c0cfbc79e9aba934b7a7c051665b19c19c5 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -766,11 +766,9 @@ class ArrayBase(Record):
         info_entries.append("type: %s" % type_str)
 
         if self.shape is None:
-            pass
+            info_entries.append("shape: unknown")
         elif self.shape is lp.auto:
             info_entries.append("shape: auto")
-        elif self.shape == ():
-            pass
         else:
             info_entries.append("shape: (%s)"
                     % ", ".join(str(i) for i in self.shape))
@@ -874,11 +872,15 @@ class ArrayBase(Record):
 
         return 1
 
-    def decl_info(self, target, is_written, index_dtype):
+    def decl_info(self, target, is_written, index_dtype, shape_override=None):
         """Return a list of :class:`loopy.codegen.ImplementedDataInfo`
         instances corresponding to the array.
         """
 
+        array_shape = self.shape
+        if shape_override is not None:
+            array_shape = shape_override
+
         from loopy.codegen import ImplementedDataInfo
         from loopy.kernel.data import ValueArg
 
@@ -978,10 +980,10 @@ class ArrayBase(Record):
             dim_tag = self.dim_tags[user_axis]
 
             if isinstance(dim_tag, FixedStrideArrayDimTag):
-                if self.shape is None:
+                if array_shape is None:
                     new_shape_axis = None
                 else:
-                    new_shape_axis = self.shape[user_axis]
+                    new_shape_axis = array_shape[user_axis]
 
                 import loopy as lp
                 if dim_tag.stride is lp.auto:
@@ -1004,7 +1006,7 @@ class ArrayBase(Record):
                     yield res
 
             elif isinstance(dim_tag, SeparateArrayArrayDimTag):
-                shape_i = self.shape[user_axis]
+                shape_i = array_shape[user_axis]
                 if not is_integer(shape_i):
                     raise LoopyError("shape of '%s' has non-constant "
                             "integer axis %d (0-based)" % (
@@ -1018,7 +1020,7 @@ class ArrayBase(Record):
                         yield res
 
             elif isinstance(dim_tag, VectorArrayDimTag):
-                shape_i = self.shape[user_axis]
+                shape_i = array_shape[user_axis]
                 if not is_integer(shape_i):
                     raise LoopyError("shape of '%s' has non-constant "
                             "integer axis %d (0-based)" % (
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e891f06261c27d1dff446e7ec567dbe88c0647da..4683ca905c32cdcade15e32c9745ec353d886375 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 
 import numpy as np
+from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import IdentityMapper, WalkMapper
 from loopy.kernel.data import (
         InstructionBase, ExpressionInstruction, SubstitutionRule)
@@ -33,7 +34,7 @@ import islpy as isl
 from islpy import dim_type
 
 import six
-from six.moves import range, zip
+from six.moves import range, zip, intern
 
 import re
 import sys
@@ -216,7 +217,7 @@ def parse_insn(insn):
                     opt_value = option[equal_idx+1:].strip()
 
                 if opt_key == "id":
-                    insn_id = opt_value
+                    insn_id = intern(opt_value)
                 elif opt_key == "id_prefix":
                     insn_id = UniqueName(opt_value)
                 elif opt_key == "priority":
@@ -235,17 +236,18 @@ def parse_insn(insn):
                         insn_deps_is_final = True
                         opt_value = (opt_value[1:]).strip()
 
-                    insn_deps = frozenset(dep.strip() for dep in opt_value.split(":")
+                    insn_deps = frozenset(
+                            intern(dep.strip()) for dep in opt_value.split(":")
                             if dep.strip())
 
                 elif opt_key == "groups":
                     insn_groups = frozenset(
-                            grp.strip() for grp in opt_value.split(":")
+                            intern(grp.strip()) for grp in opt_value.split(":")
                             if grp.strip())
 
                 elif opt_key == "conflicts":
                     conflicts_with_groups = frozenset(
-                            grp.strip() for grp in opt_value.split(":")
+                            intern(grp.strip()) for grp in opt_value.split(":")
                             if grp.strip())
 
                 elif opt_key == "inames":
@@ -255,10 +257,10 @@ def parse_insn(insn):
                     else:
                         forced_iname_deps_is_final = True
 
-                    forced_iname_deps = frozenset(opt_value.split(":"))
+                    forced_iname_deps = intern_frozenset_of_ids(opt_value.split(":"))
 
                 elif opt_key == "if":
-                    predicates = frozenset(opt_value.split(":"))
+                    predicates = intern_frozenset_of_ids(opt_value.split(":"))
 
                 elif opt_key == "tags":
                     tags = tuple(
@@ -284,7 +286,10 @@ def parse_insn(insn):
                     "be variable or subscript" % lhs)
 
         return ExpressionInstruction(
-                    id=insn_id,
+                    id=(
+                        intern(insn_id)
+                        if isinstance(insn_id, str)
+                        else insn_id),
                     insn_deps=insn_deps,
                     insn_deps_is_final=insn_deps_is_final,
                     groups=insn_groups,
@@ -326,7 +331,17 @@ def parse_insn(insn):
 
 def parse_if_necessary(insn, defines):
     if isinstance(insn, InstructionBase):
-        yield insn, []
+        yield insn.copy(
+                id=intern(insn.id) if isinstance(insn.id, str) else insn.id,
+                insn_deps=frozenset(intern(dep) for dep in insn.insn_deps),
+                groups=frozenset(intern(grp) for grp in insn.groups),
+                conflicts_with_groups=frozenset(
+                    intern(grp) for grp in insn.conflicts_with_groups),
+                forced_iname_deps=frozenset(
+                    intern(iname) for iname in insn.forced_iname_deps),
+                predicates=frozenset(
+                    intern(pred) for pred in insn.predicates),
+                ), []
         return
     elif not isinstance(insn, str):
         raise TypeError("Instructions must be either an Instruction "
@@ -692,7 +707,7 @@ class CSEToAssignmentMapper(IdentityMapper):
             return var
 
 
-def expand_cses(knl):
+def expand_cses(instructions, cse_prefix="cse_expr"):
     def add_assignment(base_name, expr, dtype):
         if base_name is None:
             base_name = "var"
@@ -706,16 +721,15 @@ def expand_cses(knl):
             dtype = np.dtype(dtype)
 
         from loopy.kernel.data import TemporaryVariable
-        new_temp_vars[new_var_name] = TemporaryVariable(
+        new_temp_vars.append(TemporaryVariable(
                 name=new_var_name,
                 dtype=dtype,
                 is_local=lp.auto,
-                shape=())
+                shape=()))
 
         from pymbolic.primitives import Variable
         new_insn = ExpressionInstruction(
-                id=knl.make_unique_instruction_id(
-                    extra_used_ids=newly_created_insn_ids),
+                id=None,
                 assignee=Variable(new_var_name), expression=expr,
                 predicates=insn.predicates)
         newly_created_insn_ids.add(new_insn.id)
@@ -727,20 +741,19 @@ def expand_cses(knl):
 
     new_insns = []
 
-    var_name_gen = knl.get_var_name_generator()
+    from pytools import UniqueNameGenerator
+    var_name_gen = UniqueNameGenerator(forced_prefix=cse_prefix)
 
     newly_created_insn_ids = set()
-    new_temp_vars = knl.temporary_variables.copy()
+    new_temp_vars = []
 
-    for insn in knl.instructions:
+    for insn in instructions:
         if isinstance(insn, ExpressionInstruction):
             new_insns.append(insn.copy(expression=cseam(insn.expression)))
         else:
             new_insns.append(insn)
 
-    return knl.copy(
-            instructions=new_insns,
-            temporary_variables=new_temp_vars)
+    return (new_insns, new_temp_vars)
 
 # }}}
 
@@ -1169,6 +1182,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     # }}}
 
+    instructions, cse_temp_vars = expand_cses(instructions)
+    for tv in cse_temp_vars:
+        temporary_variables[tv.name] = tv
+    del cse_temp_vars
+
     domains = parse_domains(domains, defines)
 
     arg_guesser = ArgumentGuesser(domains, instructions,
@@ -1194,10 +1212,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     check_for_nonexistent_iname_deps(knl)
 
-    knl = tag_reduction_inames_as_sequential(knl)
     knl = create_temporaries(knl, default_order)
     knl = determine_shapes_of_temporaries(knl)
-    knl = expand_cses(knl)
+    knl = tag_reduction_inames_as_sequential(knl)
     knl = expand_defines_in_shapes(knl, defines)
     knl = guess_arg_shape_if_requested(knl, default_order)
     knl = apply_default_order_to_args(knl, default_order)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index b88929358afe99c1c9b5450c53df0f4cec7473fa..c5cecfde2fa4005669d1fca5f3439ca282f2c3c0 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 """
 
 
+from six.moves import intern
 import numpy as np
 from pytools import Record, memoize_method
 from loopy.kernel.array import ArrayBase
@@ -185,6 +186,8 @@ def parse_tag(tag):
 
 class KernelArgument(Record):
     def __init__(self, **kwargs):
+        kwargs["name"] = intern(kwargs.pop("name"))
+
         dtype = kwargs.pop("dtype", None)
 
         if isinstance(dtype, np.dtype):
@@ -312,6 +315,11 @@ class TemporaryVariable(ArrayBase):
         Whether this is temporary lives in ``local`` memory.
         May be *True*, *False*, or :class:`loopy.auto` if this is
         to be automatically determined.
+
+    .. attribute:: base_storage
+
+        The name of a storage array that is to be used to actually
+        hold the data in this temporary.
     """
 
     min_target_axes = 0
@@ -320,12 +328,14 @@ class TemporaryVariable(ArrayBase):
     allowed_extra_kwargs = [
             "storage_shape",
             "base_indices",
-            "is_local"
+            "is_local",
+            "base_storage"
             ]
 
     def __init__(self, name, dtype=None, shape=(), is_local=auto,
             dim_tags=None, offset=0, strides=None, order=None,
-            base_indices=None, storage_shape=None):
+            base_indices=None, storage_shape=None,
+            base_storage=None):
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -339,35 +349,29 @@ class TemporaryVariable(ArrayBase):
         if base_indices is None:
             base_indices = (0,) * len(shape)
 
-        ArrayBase.__init__(self, name=name,
+        ArrayBase.__init__(self, name=intern(name),
                 dtype=dtype, shape=shape,
                 dim_tags=dim_tags, order="C",
                 base_indices=base_indices, is_local=is_local,
-                storage_shape=storage_shape)
+                storage_shape=storage_shape,
+                base_storage=base_storage)
 
     @property
     def nbytes(self):
-        from pytools import product
-        return product(si for si in self.shape)*self.dtype.itemsize
-
-    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
-        from cgen import ArrayOf
-        from loopy.codegen import POD  # uses the correct complex type
-        from cgen.opencl import CLLocal
-
-        temp_var_decl = POD(target, dtype, self.name)
-
-        # FIXME take into account storage_shape, or something like it
-        storage_shape = shape
+        shape = self.shape
+        if self.storage_shape is not None:
+            shape = self.storage_shape
 
-        if storage_shape:
-            temp_var_decl = ArrayOf(temp_var_decl,
-                    " * ".join(str(s) for s in storage_shape))
+        from pytools import product
+        return product(si for si in shape)*self.dtype.itemsize
 
-        if self.is_local:
-            temp_var_decl = CLLocal(temp_var_decl)
+    def decl_info(self, target, index_dtype):
+        return super(TemporaryVariable, self).decl_info(
+                target, is_written=True, index_dtype=index_dtype,
+                shape_override=self.storage_shape)
 
-        return temp_var_decl
+    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
+        return None
 
     def __str__(self):
         return self.stringify(include_typename=False)
@@ -512,6 +516,9 @@ class InstructionBase(Record):
             forced_iname_deps_is_final, forced_iname_deps, priority,
             boostable, boostable_into, predicates, tags):
 
+        if insn_deps is None:
+            insn_deps = frozenset()
+
         if groups is None:
             groups = frozenset()
 
@@ -531,6 +538,17 @@ class InstructionBase(Record):
         if tags is None:
             tags = ()
 
+        # Periodically reenable these and run the tests to ensure all
+        # performance-relevant identifiers are interned.
+        #
+        # from loopy.tools import is_interned
+        # assert is_interned(id)
+        # assert all(is_interned(dep) for dep in insn_deps)
+        # assert all(is_interned(grp) for grp in groups)
+        # assert all(is_interned(grp) for grp in conflicts_with_groups)
+        # assert all(is_interned(iname) for iname in forced_iname_deps)
+        # assert all(is_interned(pred) for pred in predicates)
+
         assert isinstance(forced_iname_deps, frozenset)
         assert isinstance(insn_deps, frozenset) or insn_deps is None
         assert isinstance(groups, frozenset)
@@ -650,6 +668,21 @@ class InstructionBase(Record):
 
     # }}}
 
+    def __setstate__(self, val):
+        super(InstructionBase, self).__setstate__(val)
+
+        from loopy.tools import intern_frozenset_of_ids
+
+        self.id = intern(self.id)
+        self.insn_deps = intern_frozenset_of_ids(self.insn_deps)
+        self.groups = intern_frozenset_of_ids(self.groups)
+        self.conflicts_with_groups = (
+                intern_frozenset_of_ids(self.conflicts_with_groups))
+        self.forced_iname_deps = (
+                intern_frozenset_of_ids(self.forced_iname_deps))
+        self.predicates = (
+                intern_frozenset_of_ids(self.predicates))
+
 # }}}
 
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index b59c40731d91d1689d2cd9c00884069d35f7856a..be6f32a9bf78fab306cb4acd3f45a1a4f2e66f34 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1,8 +1,6 @@
 """Operations on the kernel object."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -26,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+from six.moves import intern
 
 import numpy as np
 from islpy import dim_type
@@ -204,7 +204,7 @@ def find_all_insn_inames(kernel):
                     # current inames refer to.
 
                     if par in kernel.all_inames():
-                        inames_new.add(par)
+                        inames_new.add(intern(par))
 
                     # If something writes the bounds of a loop in which I'm
                     # sitting, I had better be in the inames that the writer is
diff --git a/loopy/precompute.py b/loopy/precompute.py
index b1df5f6786bfee3abf9d1d58dfb30b5bb3d72bd3..ee7f815cf90cd2e870af4b153435083f264503e3 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -136,7 +136,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
             access_descriptors, array_base_map,
             storage_axis_names, storage_axis_sources,
             non1_storage_axis_names,
-            temporary_name):
+            temporary_name, compute_insn_id):
         super(RuleInvocationReplacer, self).__init__(rule_mapping_context)
 
         self.subst_name = subst_name
@@ -151,6 +151,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
         self.non1_storage_axis_names = non1_storage_axis_names
 
         self.temporary_name = temporary_name
+        self.compute_insn_id = compute_insn_id
 
     def map_substitution(self, name, tag, arguments, expn_state):
         if not (
@@ -211,8 +212,26 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
         # further as compute expression has already been seen
         # by rule_mapping_context.
 
+        self.replaced_something = True
+
         return new_outer_expr
 
+    def map_kernel(self, kernel):
+        new_insns = []
+
+        for insn in kernel.instructions:
+            self.replaced_something = False
+
+            insn = insn.with_transformed_expressions(self, kernel, insn)
+
+            if self.replaced_something:
+                insn = insn.copy(
+                        insn_deps=insn.insn_deps | frozenset([self.compute_insn_id]))
+
+            new_insns.append(insn)
+
+        return kernel.copy(instructions=new_insns)
+
 # }}}
 
 
@@ -220,7 +239,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, temporary_name=None, precompute_inames=None,
         storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
         fetch_bounding_box=False, temporary_is_local=None,
-        insn_id=None):
+        compute_insn_id=None):
     """Precompute the expression described in the substitution rule determined by
     *subst_use* and store it in a temporary array. A precomputation needs two
     things to operate, a list of *sweep_inames* (order irrelevant) and an
@@ -280,7 +299,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         If the specified inames do not already exist, they will be
         created. If they do already exist, their loop domain is verified
         against the one required for this precomputation.
-    :arg insn_id: The ID of the instruction performing the precomputation.
+    :arg compute_insn_id: The ID of the instruction performing the precomputation.
 
     If `storage_axes` is not specified, it defaults to the arrangement
     `<direct sweep axes><arguments>` with the direct sweep axes being the
@@ -686,11 +705,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     # }}}
 
     from loopy.kernel.data import ExpressionInstruction
-    if insn_id is None:
-        insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name)
+    if compute_insn_id is None:
+        compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name)
 
     compute_insn = ExpressionInstruction(
-            id=insn_id,
+            id=compute_insn_id,
             assignee=assignee,
             expression=compute_expression)
 
@@ -703,7 +722,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             access_descriptors, abm,
             storage_axis_names, storage_axis_sources,
             non1_storage_axis_names,
-            temporary_name)
+            temporary_name, compute_insn_id)
 
     kernel = invr.map_kernel(kernel)
     kernel = kernel.copy(
diff --git a/loopy/schedule.py b/loopy/schedule.py
index b44569b97bf45038da949b3128c992ea87afba89..1d0dc1221a8280d73cdd03858bb72404c7c79afb 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -171,7 +171,7 @@ def find_used_inames_within(kernel, sched_index):
     return result
 
 
-def loop_nest_map(kernel):
+def find_loop_nest_map(kernel):
     """Returns a dictionary mapping inames to other inames that are
     always nested around them.
     """
@@ -212,6 +212,49 @@ def loop_nest_map(kernel):
     return result
 
 
+def find_loop_insn_dep_map(kernel, loop_nest_map):
+    """Returns a dictionary mapping inames to other instruction ids that need to
+    be scheduled before the iname should be eligible for scheduling.
+    """
+
+    result = {}
+
+    from loopy.kernel.data import ParallelTag
+    for insn in kernel.instructions:
+        for iname in kernel.insn_inames(insn):
+            if isinstance(kernel.iname_to_tag.get(iname), ParallelTag):
+                continue
+
+            for dep_insn_id in insn.insn_deps:
+                dep_insn = kernel.id_to_insn[dep_insn_id]
+                dep_insn_inames = kernel.insn_inames(dep_insn)
+
+                if iname in dep_insn_inames:
+                    # Nothing to be learned, dependency is in loop over iname
+                    # already.
+                    continue
+
+                # To make sure dep_insn belongs outside of iname, we must prove
+                # (via loop_nest_map) that all inames that dep_insn will be
+                # executed inside are nested *around* iname.
+                if not dep_insn_inames <= loop_nest_map[iname]:
+                    continue
+
+                iname_dep = result.setdefault(iname, set())
+                if dep_insn_id not in iname_dep:
+                    logger.debug("{knl}: loop dependency map: iname '{iname}' "
+                            "depends on '{dep_insn}' via '{insn}'"
+                            .format(
+                                knl=kernel.name,
+                                iname=iname,
+                                dep_insn=dep_insn_id,
+                                insn=insn.id))
+
+                    iname_dep.add(dep_insn_id)
+
+    return result
+
+
 def group_insn_counts(kernel):
     result = {}
 
@@ -422,10 +465,14 @@ def generate_loop_schedules_internal(
     reachable_insn_ids = set()
     active_groups = frozenset(sched_state.active_group_counts)
 
-    for insn_id in sorted(sched_state.unscheduled_insn_ids,
-            key=lambda insn_id: kernel.id_to_insn[insn_id].priority,
-            reverse=True):
+    def insn_sort_key(insn_id):
+        insn = kernel.id_to_insn[insn_id]
+        return (insn.priority, len(active_groups & insn.groups))
+
+    insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids,
+            key=insn_sort_key, reverse=True)
 
+    for insn_id in insn_ids_to_try:
         insn = kernel.id_to_insn[insn_id]
 
         is_ready = insn.insn_deps <= sched_state.scheduled_insn_ids
@@ -497,7 +544,7 @@ def generate_loop_schedules_internal(
 
                     else:
                         new_active_group_counts[grp] = (
-                                sched_state.group_insn_counts[grp])
+                                sched_state.group_insn_counts[grp] - 1)
 
             else:
                 new_active_group_counts = sched_state.active_group_counts
@@ -522,7 +569,10 @@ def generate_loop_schedules_internal(
                     allow_insn=True):
                 yield sub_sched
 
-            return
+            if not sched_state.group_insn_counts:
+                # No groups: We won't need to backtrack on scheduling
+                # instructions.
+                return
 
     # }}}
 
@@ -603,6 +653,9 @@ def generate_loop_schedules_internal(
         print("active inames :", ",".join(sched_state.active_inames))
         print("inames entered so far :", ",".join(sched_state.entered_inames))
         print("reachable insns:", ",".join(reachable_insn_ids))
+        print("active groups (with insn counts):", ",".join(
+            "%s: %d" % (grp, c)
+            for grp, c in six.iteritems(sched_state.active_group_counts)))
         print(75*"-")
 
     if needed_inames:
@@ -619,6 +672,22 @@ def generate_loop_schedules_internal(
                     print("scheduling %s prohibited by loop nest map" % iname)
                 continue
 
+            if (
+                    not sched_state.loop_insn_dep_map.get(iname, set())
+                    <= sched_state.scheduled_insn_ids):
+                if debug_mode:
+                    print(
+                            "scheduling {iname} prohibited by loop dependency map "
+                            "(needs '{needed_insns})'"
+                            .format(
+                                iname=iname,
+                                needed_insns=", ".join(
+                                    sched_state.loop_insn_dep_map.get(iname, set())
+                                    -
+                                    sched_state.scheduled_insn_ids)))
+
+                continue
+
             iname_home_domain = kernel.domains[kernel.get_home_domain_index(iname)]
             from islpy import dim_type
             iname_home_domain_params = set(
@@ -795,6 +864,11 @@ class DependencyRecord(Record):
 
         A :class:`loopy.InstructionBase` instance.
 
+    .. attribute:: dep_descr
+
+        A string containing a phrase describing the dependency. The variables
+        '{src}' and '{tgt}' will be replaced by their respective instruction IDs.
+
     .. attribute:: variable
 
         A string, the name of the variable that caused the dependency to arise.
@@ -802,23 +876,15 @@ class DependencyRecord(Record):
     .. attribute:: var_kind
 
         "global" or "local"
-
-    .. attribute:: is_forward
-
-        A :class:`bool` indicating whether this is a forward or reverse
-        dependency.
-
-        In a 'forward' dependency, the target depends on the source.
-        In a 'reverse' dependency, the source depends on the target.
     """
 
-    def __init__(self, source, target, variable, var_kind, is_forward):
+    def __init__(self, source, target, dep_descr, variable, var_kind):
         Record.__init__(self,
                 source=source,
                 target=target,
+                dep_descr=dep_descr,
                 variable=variable,
-                var_kind=var_kind,
-                is_forward=is_forward)
+                var_kind=var_kind)
 
 
 def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
@@ -827,7 +893,7 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     at least one write), then the function will return a tuple
     ``(target, source, var_name)``. Otherwise, it will return *None*.
 
-    This function finds  direct or indirect instruction dependencies, but does
+    This function finds direct or indirect instruction dependencies, but does
     not attempt to guess dependencies that exist based on common access to
     variables.
 
@@ -847,11 +913,30 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     if reverse:
         source, target = target, source
 
-    # Check that a dependency exists.
+    # {{{ check that a dependency exists
+
+    dep_descr = None
+
     target_deps = kernel.recursive_insn_dep_map()[target.id]
-    if source.id not in target_deps:
+    if source.id in target_deps:
+        if reverse:
+            dep_descr = "{src} rev-depends on {tgt}"
+        else:
+            dep_descr = "{tgt} depends on {src}"
+
+    grps = source.groups & target.conflicts_with_groups
+    if grps:
+        dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps)
+
+    grps = target.groups & source.conflicts_with_groups
+    if grps:
+        dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps)
+
+    if not dep_descr:
         return None
 
+    # }}}
+
     if var_kind == "local":
         relevant_vars = kernel.local_var_names()
     elif var_kind == "global":
@@ -859,11 +944,27 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     else:
         raise ValueError("unknown 'var_kind': %s" % var_kind)
 
-    tgt_write = set(target.assignee_var_names()) & relevant_vars
-    tgt_read = target.read_dependency_names() & relevant_vars
+    temp_to_base_storage = kernel.get_temporary_to_base_storage_map()
+
+    def map_to_base_storage(var_names):
+        result = set(var_names)
 
-    src_write = set(source.assignee_var_names()) & relevant_vars
-    src_read = source.read_dependency_names() & relevant_vars
+        for name in var_names:
+            bs = temp_to_base_storage.get(name)
+            if bs is not None:
+                result.add(bs)
+
+        return result
+
+    tgt_write = map_to_base_storage(
+            set(target.assignee_var_names()) & relevant_vars)
+    tgt_read = map_to_base_storage(
+            target.read_dependency_names() & relevant_vars)
+
+    src_write = map_to_base_storage(
+            set(source.assignee_var_names()) & relevant_vars)
+    src_read = map_to_base_storage(
+            source.read_dependency_names() & relevant_vars)
 
     waw = tgt_write & src_write
     raw = tgt_read & src_write
@@ -873,9 +974,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
         return DependencyRecord(
                 source=source,
                 target=target,
+                dep_descr=dep_descr,
                 variable=var_name,
-                var_kind=var_kind,
-                is_forward=not reverse)
+                var_kind=var_kind)
 
     if source is target:
         return None
@@ -884,9 +985,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
         return DependencyRecord(
                 source=source,
                 target=target,
+                dep_descr=dep_descr,
                 variable=var_name,
-                var_kind=var_kind,
-                is_forward=not reverse)
+                var_kind=var_kind)
 
     return None
 
@@ -998,12 +1099,9 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
 
         comment = None
         if dep is not None:
-            if dep.is_forward:
-                comment = "for %s (%s depends on %s)" % (
-                        dep.variable, dep.target.id, dep.source.id)
-            else:
-                comment = "for %s (%s rev-depends on %s)" % (
-                        dep.variable, dep.source.id, dep.target.id)
+            comment = "for %s (%s)" % (
+                    dep.variable, dep.dep_descr.format(
+                        tgt=dep.target.id, src=dep.source.id))
 
         result.append(Barrier(comment=comment, kind=dep.var_kind))
 
@@ -1047,10 +1145,6 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
             # (for leading (before-first-barrier) bit of loop body)
             for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]):
                 search_set = candidates
-                if not reverse:
-                    # can limit search set in case of forward dep
-                    search_set = search_set \
-                            & kernel.recursive_insn_dep_map()[insn_id]
 
                 for dep_src_insn_id in search_set:
                     dep = get_barrier_needing_dependency(
@@ -1090,10 +1184,6 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
             i += 1
 
             search_set = candidates
-            if not reverse:
-                # can limit search set in case of forward dep
-                search_set = search_set \
-                        & kernel.recursive_insn_dep_map()[sched_item.insn_id]
 
             for dep_src_insn_id in search_set:
                 dep = get_barrier_needing_dependency(
@@ -1153,9 +1243,11 @@ def generate_loop_schedules(kernel, debug_args={}):
             iname for iname in kernel.all_inames()
             if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
 
+    loop_nest_map = find_loop_nest_map(kernel)
     sched_state = SchedulerState(
             kernel=kernel,
-            loop_nest_map=loop_nest_map(kernel),
+            loop_nest_map=loop_nest_map,
+            loop_insn_dep_map=find_loop_insn_dep_map(kernel, loop_nest_map),
             breakable_inames=ilp_inames,
             ilp_inames=ilp_inames,
             vec_inames=vec_inames,
@@ -1180,38 +1272,9 @@ def generate_loop_schedules(kernel, debug_args={}):
                 debug=debug, allow_boost=None),
             generate_loop_schedules_internal(sched_state,
                 debug=debug)]
-    for gen in generators:
-        for gen_sched in gen:
-            # gen_sched = insert_barriers(kernel, gen_sched,
-            #         reverse=False, kind="global")
-
-            # for sched_item in gen_sched:
-            #     if isinstance(sched_item, Barrier) and sched_item.kind == "global":
-            #         raise LoopyError("kernel requires a global barrier %s"
-            #                 % sched_item.comment)
-
-            gen_sched = insert_barriers(kernel, gen_sched,
-                    reverse=False, kind="local")
-
-            debug.stop()
-            yield kernel.copy(
-                    schedule=gen_sched,
-                    state=kernel_state.SCHEDULED)
-            debug.start()
-
-            schedule_count += 1
 
-        # if no-boost mode yielded a viable schedule, stop now
-        if schedule_count:
-            break
-
-    debug.done_scheduling()
-
-    if not schedule_count:
+    def print_longest_dead_end():
         if debug.interactive:
-            print(75*"-")
-            print("ERROR: Sorry--loo.py did not find a schedule for your kernel.")
-            print(75*"-")
             print("Loo.py will now show you the scheduler state at the point")
             print("where the longest (dead-end) schedule was generated, in the")
             print("the hope that some of this makes sense and helps you find")
@@ -1230,6 +1293,52 @@ def generate_loop_schedules(kernel, debug_args={}):
                     debug=debug):
                 pass
 
+    try:
+        for gen in generators:
+            for gen_sched in gen:
+                # gen_sched = insert_barriers(kernel, gen_sched,
+                #         reverse=False, kind="global")
+
+                # for sched_item in gen_sched:
+                #     if (
+                #             isinstance(sched_item, Barrier)
+                #             and sched_item.kind == "global"):
+                #         raise LoopyError("kernel requires a global barrier %s"
+                #                 % sched_item.comment)
+
+                debug.stop()
+
+                logger.info("%s: barrier insertion: start" % kernel.name)
+
+                gen_sched = insert_barriers(kernel, gen_sched,
+                        reverse=False, kind="local")
+
+                logger.info("%s: barrier insertion: done" % kernel.name)
+
+                yield kernel.copy(
+                        schedule=gen_sched,
+                        state=kernel_state.SCHEDULED)
+                debug.start()
+
+                schedule_count += 1
+
+            # if no-boost mode yielded a viable schedule, stop now
+            if schedule_count:
+                break
+
+    except KeyboardInterrupt:
+        print(75*"-")
+        print("Interrupted during scheduling")
+        print(75*"-")
+        print_longest_dead_end()
+        raise
+
+    debug.done_scheduling()
+    if not schedule_count:
+        print(75*"-")
+        print("ERROR: Sorry--loo.py did not find a schedule for your kernel.")
+        print(75*"-")
+        print_longest_dead_end()
         raise RuntimeError("no valid schedules found")
 
     logger.info("%s: schedule done" % kernel.name)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 7281cd2b2cc06f3b9d578769609037aafc880e0f..d25ea3eaca44187e8ad7020b6ddfd9e5bedc95bc 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -55,13 +55,14 @@ class ToCountMap:
                                 "to {} {}. ToCountMap may only be added to "
                                 "0 and other ToCountMap objects."
                                 .format(type(other), other))
-            return
+
         return self
 
     def __mul__(self, other):
         if isinstance(other, isl.PwQPolynomial):
-            return ToCountMap({index: self.dict[index]*other
-                                     for index in self.dict.keys()})
+            return ToCountMap(dict(
+                (index, self.dict[index]*other)
+                for index in self.dict.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
                                 "ToCountMap by {} {}."
@@ -206,7 +207,7 @@ class ExpressionOpCounter(CombineMapper):
                                   "map_slice not implemented.")
 
 
-class ExpressionSubscriptCounter(CombineMapper):
+class GlobalSubscriptCounter(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
@@ -344,12 +345,12 @@ class ExpressionSubscriptCounter(CombineMapper):
     map_logical_and = map_logical_or
 
     def map_if(self, expr):
-        warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
                       "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("ExpressionSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
                       "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
@@ -357,22 +358,22 @@ class ExpressionSubscriptCounter(CombineMapper):
     map_max = map_min
 
     def map_common_subexpression(self, expr):
-        raise NotImplementedError("ExpressionSubscriptCounter encountered "
+        raise NotImplementedError("GlobalSubscriptCounter encountered "
                                   "common_subexpression, "
                                   "map_common_subexpression not implemented.")
 
     def map_substitution(self, expr):
-        raise NotImplementedError("ExpressionSubscriptCounter encountered "
+        raise NotImplementedError("GlobalSubscriptCounter encountered "
                                   "substitution, "
                                   "map_substitution not implemented.")
 
     def map_derivative(self, expr):
-        raise NotImplementedError("ExpressionSubscriptCounter encountered "
+        raise NotImplementedError("GlobalSubscriptCounter encountered "
                                   "derivative, "
                                   "map_derivative not implemented.")
 
     def map_slice(self, expr):
-        raise NotImplementedError("ExpressionSubscriptCounter encountered slice, "
+        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
                                   "map_slice not implemented.")
 
 
@@ -449,9 +450,8 @@ def get_op_poly(knl):
     return op_poly
 
 
-def get_DRAM_access_poly(knl):  # for now just counting subscripts
-
-    """Count the number of DRAM accesses in a loopy kernel.
+def get_gmem_access_poly(knl):  # for now just counting subscripts
+    """Count the number of global memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                     counted.
@@ -476,7 +476,7 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
 
         # (first create loopy kernel and specify array data types)
 
-        subscript_map = get_DRAM_access_poly(knl)
+        subscript_map = get_gmem_access_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
 
         f32_uncoalesced_load = subscript_map.dict[
@@ -498,7 +498,7 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
     knl = preprocess_kernel(knl)
 
     subs_poly = 0
-    subscript_counter = ExpressionSubscriptCounter(knl)
+    subscript_counter = GlobalSubscriptCounter(knl)
     for insn in knl.instructions:
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
@@ -517,6 +517,13 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
     return subs_poly
 
 
+def get_DRAM_access_poly(knl):
+    from warnings import warn
+    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
+            DeprecationWarning, stacklevel=2)
+    return get_gmem_access_poly(knl)
+
+
 def get_barrier_poly(knl):
 
     """Count the number of barriers each thread encounters in a loopy kernel.
diff --git a/loopy/subst.py b/loopy/subst.py
index a0a031718962df3053b80058818b2f2a4b88d2c8..a29e950a1f32d660eb10147c8638612078e816aa 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -198,16 +198,16 @@ def extract_subst(kernel, subst_name, template, parameters=()):
             substitutions=new_substs)
 
 
-# {{{ temporary_to_subst
+# {{{ assignment_to_subst
 
-class TemporaryToSubstChanger(RuleAwareIdentityMapper):
-    def __init__(self, rule_mapping_context, temp_name, definition_insn_ids,
+class AssignmentToSubstChanger(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, lhs_name, definition_insn_ids,
             usage_to_definition, extra_arguments, within):
         self.var_name_gen = rule_mapping_context.make_unique_var_name
 
-        super(TemporaryToSubstChanger, self).__init__(rule_mapping_context)
+        super(AssignmentToSubstChanger, self).__init__(rule_mapping_context)
 
-        self.temp_name = temp_name
+        self.lhs_name = lhs_name
         self.definition_insn_ids = definition_insn_ids
         self.usage_to_definition = usage_to_definition
 
@@ -226,28 +226,28 @@ class TemporaryToSubstChanger(RuleAwareIdentityMapper):
         try:
             return self.definition_insn_id_to_subst_name[def_insn_id]
         except KeyError:
-            subst_name = self.var_name_gen(self.temp_name+"_subst")
+            subst_name = self.var_name_gen(self.lhs_name+"_subst")
             self.definition_insn_id_to_subst_name[def_insn_id] = subst_name
             return subst_name
 
     def map_variable(self, expr, expn_state):
-        if (expr.name == self.temp_name
+        if (expr.name == self.lhs_name
                 and expr.name not in expn_state.arg_context):
             result = self.transform_access(None, expn_state)
             if result is not None:
                 return result
 
-        return super(TemporaryToSubstChanger, self).map_variable(
+        return super(AssignmentToSubstChanger, self).map_variable(
                 expr, expn_state)
 
     def map_subscript(self, expr, expn_state):
-        if (expr.aggregate.name == self.temp_name
+        if (expr.aggregate.name == self.lhs_name
                 and expr.aggregate.name not in expn_state.arg_context):
             result = self.transform_access(expr.index, expn_state)
             if result is not None:
                 return result
 
-        return super(TemporaryToSubstChanger, self).map_subscript(
+        return super(AssignmentToSubstChanger, self).map_subscript(
                 expr, expn_state)
 
     def transform_access(self, index, expn_state):
@@ -280,26 +280,29 @@ class TemporaryToSubstChanger(RuleAwareIdentityMapper):
             return var(subst_name)(*index)
 
 
-def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
-    """Extract an assignment to a temporary variable
+def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
+        force_retain_argument=False):
+    """Extract an assignment (to a temporary variable or an argument)
     as a :ref:`substituiton-rule`. The temporary may be an array, in
     which case the array indices will become arguments to the substitution
     rule.
 
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
+    :arg force_retain_argument: If True and if *lhs_name* is an argument, it is
+        kept even if it is no longer referenced.
 
     This operation will change all usage sites
-    of *temp_name* matched by *within*. If there
-    are further usage sites of *temp_name*, then
-    the original assignment to *temp_name* as well
+    of *lhs_name* matched by *within*. If there
+    are further usage sites of *lhs_name*, then
+    the original assignment to *lhs_name* as well
     as the temporary variable is left in place.
     """
 
     if isinstance(extra_arguments, str):
         extra_arguments = tuple(s.strip() for s in extra_arguments.split(","))
 
-    # {{{ establish the relevant definition of temp_name for each usage site
+    # {{{ establish the relevant definition of lhs_name for each usage site
 
     dep_kernel = expand_subst(kernel)
     from loopy.preprocess import add_default_dependencies
@@ -313,11 +316,11 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
         def_id = set()
         for dep_id in insn.insn_deps:
             dep_insn = id_to_insn[dep_id]
-            if temp_name in dep_insn.write_dependency_names():
-                if temp_name in dep_insn.read_dependency_names():
+            if lhs_name in dep_insn.write_dependency_names():
+                if lhs_name in dep_insn.read_dependency_names():
                     raise LoopyError("instruction '%s' both reads *and* "
                             "writes '%s'--cannot transcribe to substitution "
-                            "rule" % (dep_id, temp_name))
+                            "rule" % (dep_id, lhs_name))
 
                 def_id.add(dep_id)
             else:
@@ -329,7 +332,7 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
             raise LoopyError("more than one write to '%s' found in "
                     "depdendencies of '%s'--definition cannot be resolved "
                     "(writer instructions ids: %s)"
-                    % (temp_name, usage_insn_id, ", ".join(def_id)))
+                    % (lhs_name, usage_insn_id, ", ".join(def_id)))
 
         if not def_id:
             return None
@@ -341,20 +344,20 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
     usage_to_definition = {}
 
     for insn in kernel.instructions:
-        if temp_name not in insn.read_dependency_names():
+        if lhs_name not in insn.read_dependency_names():
             continue
 
         def_id = get_relevant_definition_insn_id(insn.id)
         if def_id is None:
             raise LoopyError("no write to '%s' found in dependency tree "
                     "of '%s'--definition cannot be resolved"
-                    % (temp_name, insn.id))
+                    % (lhs_name, insn.id))
 
         usage_to_definition[insn.id] = def_id
 
     definition_insn_ids = set()
     for insn in kernel.instructions:
-        if temp_name in insn.write_dependency_names():
+        if lhs_name in insn.write_dependency_names():
             definition_insn_ids.add(insn.id)
 
     # }}}
@@ -364,8 +367,8 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
-    tts = TemporaryToSubstChanger(rule_mapping_context,
-            temp_name, definition_insn_ids,
+    tts = AssignmentToSubstChanger(rule_mapping_context,
+            lhs_name, definition_insn_ids,
             usage_to_definition, extra_arguments, within)
 
     kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel))
@@ -401,13 +404,28 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
 
     # {{{ delete temporary variable if possible
 
+    # (copied below if modified)
     new_temp_vars = kernel.temporary_variables
-    if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
-        # All usage sites matched--they're now substitution rules.
-        # We can get rid of the variable.
+    new_args = kernel.args
 
-        new_temp_vars = new_temp_vars.copy()
-        del new_temp_vars[temp_name]
+    if lhs_name in kernel.temporary_variables:
+        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
+            # All usage sites matched--they're now substitution rules.
+            # We can get rid of the variable.
+
+            new_temp_vars = new_temp_vars.copy()
+            del new_temp_vars[lhs_name]
+
+    if lhs_name in kernel.arg_dict and not force_retain_argument:
+        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
+            # All usage sites matched--they're now substitution rules.
+            # We can get rid of the argument
+
+            new_args = new_args[:]
+            for i in range(len(new_args)):
+                if new_args[i].name == lhs_name:
+                    del new_args[i]
+                    break
 
     # }}}
 
@@ -423,6 +441,7 @@ def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
     return kernel.copy(
             substitutions=new_substs,
             temporary_variables=new_temp_vars,
+            args=new_args,
             )
 
 # }}}
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 28943644ec9fcf540257f519c6e338bc5e7d4806..ca71c21269add662dc1ef19a4437c9f297ec6477 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -36,9 +36,9 @@ class CTarget(TargetBase):
     @memoize_method
     def get_dtype_registry(self):
         from loopy.target.c.compyte.dtypes import (
-                DTypeRegistry, fill_with_registry_with_c_types)
+                DTypeRegistry, fill_registry_with_c_types)
         result = DTypeRegistry()
-        fill_with_registry_with_c_types(result, respect_windows=False,
+        fill_registry_with_c_types(result, respect_windows=False,
                 include_bool=True)
         return result
 
@@ -85,14 +85,100 @@ class CTarget(TargetBase):
         from cgen import Block
         body = Block()
 
+        temp_decls = []
+
         # {{{ declare temporaries
 
-        body.extend(
-                idi.cgen_declarator
-                for tv in six.itervalues(kernel.temporary_variables)
-                for idi in tv.decl_info(
-                    kernel.target,
-                    is_written=True, index_dtype=kernel.index_dtype))
+        base_storage_sizes = {}
+        base_storage_to_is_local = {}
+        base_storage_to_align_bytes = {}
+
+        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
+        from loopy.codegen import POD  # uses the correct complex type
+        from cgen.opencl import CLLocal
+
+        class ConstRestrictPointer(Pointer):
+            def get_decl_pair(self):
+                sub_tp, sub_decl = self.subdecl.get_decl_pair()
+                return sub_tp, ("*const restrict %s" % sub_decl)
+
+        for tv in six.itervalues(kernel.temporary_variables):
+            decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)
+
+            if not tv.base_storage:
+                for idi in decl_info:
+                    temp_var_decl = POD(self, idi.dtype, idi.name)
+
+                    if idi.shape:
+                        temp_var_decl = ArrayOf(temp_var_decl,
+                                " * ".join(str(s) for s in idi.shape))
+
+                    if tv.is_local:
+                        temp_var_decl = CLLocal(temp_var_decl)
+
+                    temp_decls.append(temp_var_decl)
+
+            else:
+                offset = 0
+                base_storage_sizes.setdefault(tv.base_storage, []).append(
+                        tv.nbytes)
+                base_storage_to_is_local.setdefault(tv.base_storage, []).append(
+                        tv.is_local)
+
+                align_size = tv.dtype.itemsize
+
+                from loopy.kernel.array import VectorArrayDimTag
+                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
+                    if isinstance(dim_tag, VectorArrayDimTag):
+                        align_size *= axis_len
+
+                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
+                        align_size)
+
+                for idi in decl_info:
+                    cast_decl = POD(self, idi.dtype, "")
+                    temp_var_decl = POD(self, idi.dtype, idi.name)
+
+                    if tv.is_local:
+                        cast_decl = CLLocal(cast_decl)
+                        temp_var_decl = CLLocal(temp_var_decl)
+
+                    # The 'restrict' part of this is a complete lie--of course
+                    # all these temporaries are aliased. But we're promising to
+                    # not use them to shovel data from one representation to the
+                    # other. That counts, right?
+
+                    cast_decl = ConstRestrictPointer(cast_decl)
+                    temp_var_decl = ConstRestrictPointer(temp_var_decl)
+
+                    cast_tp, cast_d = cast_decl.get_decl_pair()
+                    temp_var_decl = Initializer(
+                            temp_var_decl,
+                            "(%s %s) (%s + %s)" % (
+                                " ".join(cast_tp), cast_d,
+                                tv.base_storage,
+                                offset))
+
+                    temp_decls.append(temp_var_decl)
+
+                    from pytools import product
+                    offset += (
+                            idi.dtype.itemsize
+                            * product(si for si in idi.shape))
+
+        for bs_name, bs_sizes in six.iteritems(base_storage_sizes):
+            bs_var_decl = POD(self, np.int8, bs_name)
+            if base_storage_to_is_local[bs_name]:
+                bs_var_decl = CLLocal(bs_var_decl)
+
+            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))
+
+            alignment = max(base_storage_to_align_bytes[bs_name])
+            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)
+
+            body.append(bs_var_decl)
+
+        body.extend(temp_decls)
 
         # }}}
 
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
index fb6ba114d9d906403d47b0aaf69e2fe4cef382f2..ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc 160000
--- a/loopy/target/c/compyte
+++ b/loopy/target/c/compyte
@@ -1 +1 @@
-Subproject commit fb6ba114d9d906403d47b0aaf69e2fe4cef382f2
+Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
index e4533b86dd24a8dca973ac9c8ffd022a4bed204b..eebe6f5da0b81fa9b4c1ac7b4cda0ba8b1ac283e 100644
--- a/loopy/target/opencl/__init__.py
+++ b/loopy/target/opencl/__init__.py
@@ -214,17 +214,13 @@ class OpenCLTarget(CTarget):
 
     @memoize_method
     def get_dtype_registry(self):
-        from loopy.target.c.compyte.dtypes import DTypeRegistry, fill_with_registry_with_c_types
-        result = DTypeRegistry()
-        fill_with_registry_with_c_types(result, respect_windows=False)
+        from loopy.target.c.compyte.dtypes import (DTypeRegistry,
+                fill_registry_with_opencl_c_types)
 
-        # complex number support left out
+        result = DTypeRegistry()
+        fill_registry_with_opencl_c_types(result)
 
-        # CL defines 'long' as 64-bit
-        result.get_or_register_dtype(
-                ["unsigned long", "unsigned long int"], np.uint64)
-        result.get_or_register_dtype(
-                ["signed long", "signed long int", "long int"], np.int64)
+        # no complex number support--needs PyOpenCLTarget
 
         _register_vector_types(result)
 
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index ee936680016b6808723076034c8486a49544e2bc..d13384534c70df602785d4189739a7bc86ed37db 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -233,6 +233,18 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
 
 # {{{ pyopencl tools
 
+class _LegacyTypeRegistryStub(object):
+    """Adapts legacy PyOpenCL type registry to be usable with PyOpenCLTarget."""
+
+    def get_or_register_dtype(self, names, dtype=None):
+        from pyopencl.compyte.dtypes import get_or_register_dtype
+        return get_or_register_dtype(names, dtype)
+
+    def dtype_to_ctype(self, dtype):
+        from pyopencl.compyte.dtypes import dtype_to_ctype
+        return dtype_to_ctype(dtype)
+
+
 class PyOpenCLTarget(OpenCLTarget):
     def __init__(self, device=None):
         super(PyOpenCLTarget, self).__init__()
@@ -260,8 +272,12 @@ class PyOpenCLTarget(OpenCLTarget):
         check_sizes(kernel, self.device)
 
     def get_dtype_registry(self):
-        from pyopencl.compyte.dtypes import TYPE_REGISTRY
-        return TYPE_REGISTRY
+        try:
+            from pyopencl.compyte.dtypes import TYPE_REGISTRY
+        except ImportError:
+            return _LegacyTypeRegistryStub()
+        else:
+            return TYPE_REGISTRY
 
     def is_vector_dtype(self, dtype):
         from pyopencl.array import vec
diff --git a/loopy/tools.py b/loopy/tools.py
index e734417d6095da768085fa8861c870114f071ec8..55b177bda4e6be03a985286fd4faf6322e257824 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -30,6 +30,7 @@ from loopy.symbolic import WalkMapper as LoopyWalkMapper
 from pymbolic.mapper.persistent_hash import (
         PersistentHashWalkMapper as PersistentHashWalkMapperBase)
 import six  # noqa
+from six.moves import intern
 
 
 if six.PY2:
@@ -95,6 +96,21 @@ class LoopyKeyBuilder(KeyBuilderBase):
         else:
             PersistentHashWalkMapper(key_hash)(key)
 
+
+class PymbolicExpressionHashWrapper(object):
+    def __init__(self, expression):
+        self.expression = expression
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.expression == other.expression)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.update_for_pymbolic_expression(key_hash, self.expression)
+
 # }}}
 
 
@@ -216,4 +232,13 @@ def remove_common_indentation(code, require_leading_newline=True,
 
 # }}}
 
+
+def is_interned(s):
+    return s is None or intern(s) is s
+
+
+def intern_frozenset_of_ids(fs):
+    return frozenset(intern(s) for s in fs)
+
+
 # vim: foldmethod=marker
diff --git a/loopy/version.py b/loopy/version.py
index 9f1378f162788c307c34b862e8fa0824929b9c6f..9598697b09afc091741cea5d8da37917dd88ce9d 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v10-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v11-islpy%s" % _islpy_version
diff --git a/requirements-old-pyopencl.txt b/requirements-old-pyopencl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d21187d16ec962816dd691d8d11813e9b0ff700
--- /dev/null
+++ b/requirements-old-pyopencl.txt
@@ -0,0 +1,10 @@
+git+git://github.com/inducer/pytools
+git+git://github.com/inducer/islpy
+cgen
+pyopencl==2015.1
+git+git://github.com/inducer/pymbolic
+
+hg+https://bitbucket.org/inducer/f2py
+
+# Optional, needed for using the C preprocessor on Fortran
+ply>=3.6
diff --git a/setup.py b/setup.py
index b3cefdac0ad16fa7b4ffdf9969a47ac509f7a257..1f1ea68769f71663dc719d274dc38b01b1f602ed 100644
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2014.2",
-          "pymbolic>=2014.1.1",
+          "pytools>=2015.1.3",
+          "pymbolic>=2015.2.1",
           "cgen>=2013.1.2",
           "islpy>=2014.2",
           "six",
diff --git a/test/test_fortran.py b/test/test_fortran.py
index c31c370076b681cb0593f38b6a4d92479541b872..a5b1b830bc8834637d5f4c609fff8232ef7449e6 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -123,7 +123,7 @@ def test_asterisk_in_shape(ctx_factory):
     knl(queue, inp=np.array([1, 2, 3.]), n=3)
 
 
-def test_temporary_to_subst(ctx_factory):
+def test_assignment_to_subst(ctx_factory):
     fortran_src = """
         subroutine fill(out, out2, inp, n)
           implicit none
@@ -143,13 +143,13 @@ def test_temporary_to_subst(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.temporary_to_subst(knl, "a", "i")
+    knl = lp.assignment_to_subst(knl, "a", "i")
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
-def test_temporary_to_subst_two_defs(ctx_factory):
+def test_assignment_to_subst_two_defs(ctx_factory):
     fortran_src = """
         subroutine fill(out, out2, inp, n)
           implicit none
@@ -170,13 +170,13 @@ def test_temporary_to_subst_two_defs(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.temporary_to_subst(knl, "a")
+    knl = lp.assignment_to_subst(knl, "a")
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
-def test_temporary_to_subst_indices(ctx_factory):
+def test_assignment_to_subst_indices(ctx_factory):
     fortran_src = """
         subroutine fill(out, out2, inp, n)
           implicit none
@@ -201,7 +201,7 @@ def test_temporary_to_subst_indices(ctx_factory):
     ref_knl = knl
 
     assert "a" in knl.temporary_variables
-    knl = lp.temporary_to_subst(knl, "a")
+    knl = lp.assignment_to_subst(knl, "a")
     assert "a" not in knl.temporary_variables
 
     ctx = ctx_factory()
@@ -235,7 +235,7 @@ def test_if(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.temporary_to_subst(knl, "a")
+    knl = lp.assignment_to_subst(knl, "a")
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
@@ -275,6 +275,8 @@ def test_tagged(ctx_factory):
     "i_inner,j_inner",
     ])
 def test_matmul(ctx_factory, buffer_inames):
+    logging.basicConfig(level=logging.INFO)
+
     fortran_src = """
         subroutine dgemm(m,n,l,a,b,c)
           implicit none
@@ -409,8 +411,8 @@ def test_fuse_kernels(ctx_factory):
     assert len(knl.temporary_variables) == 2
 
     # This is needed for correctness, otherwise ordering could foul things up.
-    knl = lp.temporary_to_subst(knl, "prev")
-    knl = lp.temporary_to_subst(knl, "prev_0")
+    knl = lp.assignment_to_subst(knl, "prev")
+    knl = lp.assignment_to_subst(knl, "prev_0")
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
diff --git a/test/test_linalg.py b/test/test_linalg.py
index c019eb67fbaba5e6d8983665b67002837225d9ad..c61d963903ff738b62924e31428a928a18afad60 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -554,40 +554,6 @@ def test_image_matrix_mul_ilp(ctx_factory):
             parameters={})
 
 
-@pytest.mark.skipif("sys.version_info < (2,6)")
-def test_ilp_race_matmul(ctx_factory):
-    dtype = np.float32
-    order = "C"
-
-    n = 9
-
-    knl = lp.make_kernel(
-            "{[i,j,k]: 0<=i,j,k<%d}" % n,
-            [
-                "c[i, j] = sum(k, a[i, k]*b[k, j])"
-                ],
-            [
-                lp.ImageArg("a", dtype, shape=(n, n)),
-                lp.ImageArg("b", dtype, shape=(n, n)),
-                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
-                ],
-            name="matmul")
-
-    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
-    knl = lp.split_iname(knl, "k", 2)
-    knl = lp.add_prefetch(knl, 'b', ["k_inner"])
-
-    with lp.CacheMode(False):
-        from loopy.diagnostic import WriteRaceConditionWarning
-        from warnings import catch_warnings
-        with catch_warnings(record=True) as warn_list:
-            knl = lp.preprocess_kernel(knl)
-            list(lp.generate_loop_schedules(knl))
-
-            assert any(isinstance(w.message, WriteRaceConditionWarning)
-                    for w in warn_list)
-
-
 def test_fancy_matrix_mul(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 17e0cc54359d2c5ae5a19042063b0c5a0603ca22..7cad3504859d199c0581c8d3248ebafe50a34c4a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2094,6 +2094,86 @@ def test_vectorize(ctx_factory):
             ref_knl, ctx, knl,
             parameters=dict(n=30))
 
+
+def test_alias_temporaries(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        """
+        times2(i) := 2*a[i]
+        times3(i) := 3*a[i]
+        times4(i) := 4*a[i]
+
+        x[i] = times2(i)
+        y[i] = times3(i)
+        z[i] = times4(i)
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
+
+    knl = lp.precompute(knl, "times2", "i_inner")
+    knl = lp.precompute(knl, "times3", "i_inner")
+    knl = lp.precompute(knl, "times4", "i_inner")
+
+    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=30))
+
+
+def test_fusion():
+    exp_kernel = lp.make_kernel(
+         ''' { [i]: 0<=i<n } ''',
+         ''' exp[i] = pow(E, z[i])''',
+         assumptions="n>0")
+
+    sum_kernel = lp.make_kernel(
+        '{ [j]: 0<=j<n }',
+        'out2 = sum(j, exp[j])',
+        assumptions='n>0')
+
+    knl = lp.fuse_kernels([exp_kernel, sum_kernel])
+
+    print(knl)
+
+
+def test_sci_notation_literal(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    set_kernel = lp.make_kernel(
+         ''' { [i]: 0<=i<12 } ''',
+         ''' out[i] = 1e-12''')
+
+    set_kernel = lp.set_options(set_kernel, write_cl=True)
+
+    evt, (out,) = set_kernel(queue)
+
+    assert (np.abs(out.get() - 1e-12) < 1e-20).all()
+
+
+def test_to_batched(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+         ''' { [i,j]: 0<=i,j<n } ''',
+         ''' out[i] = sum(j, a[i,j]*x[j])''')
+
+    bknl = lp.to_batched(knl, "nbatches", "out,x")
+
+    a = np.random.randn(5, 5)
+    x = np.random.randn(7, 5)
+
+    bknl(queue, a=a, x=x)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 87ed797e74fd709c29ad9d763e195ff46985ed96..a58ce6d582a8d03d622028156adff35c61009bc0 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -27,7 +27,7 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
-from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly
+from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
 import numpy as np
 
 
@@ -185,7 +185,7 @@ def test_op_counter_triangular_domain():
         assert flops == 78
 
 
-def test_DRAM_access_counter_basic():
+def test_gmem_access_counter_basic():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -199,7 +199,7 @@ def test_DRAM_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -222,7 +222,7 @@ def test_DRAM_access_counter_basic():
     assert f64 == n*m
 
 
-def test_DRAM_access_counter_reduction():
+def test_gmem_access_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -232,7 +232,7 @@ def test_DRAM_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -247,7 +247,7 @@ def test_DRAM_access_counter_reduction():
     assert f32 == n*l
 
 
-def test_DRAM_access_counter_logic():
+def test_gmem_access_counter_logic():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -259,7 +259,7 @@ def test_DRAM_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -278,7 +278,7 @@ def test_DRAM_access_counter_logic():
     assert f64 == n*m
 
 
-def test_DRAM_access_counter_specialops():
+def test_gmem_access_counter_specialops():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -292,7 +292,7 @@ def test_DRAM_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -315,7 +315,7 @@ def test_DRAM_access_counter_specialops():
     assert f64 == n*m
 
 
-def test_DRAM_access_counter_bitwise():
+def test_gmem_access_counter_bitwise():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -332,7 +332,7 @@ def test_DRAM_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -347,7 +347,7 @@ def test_DRAM_access_counter_bitwise():
     assert i32 == n*m+n*m*l
 
 
-def test_DRAM_access_counter_mixed():
+def test_gmem_access_counter_mixed():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -363,7 +363,7 @@ def test_DRAM_access_counter_mixed():
     knl = lp.split_iname(knl, "j", 16)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = get_DRAM_access_poly(knl)  # noqa
+    poly = get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -386,7 +386,7 @@ def test_DRAM_access_counter_mixed():
     assert f32nonconsec == n*m*l
 
 
-def test_DRAM_access_counter_nonconsec():
+def test_gmem_access_counter_nonconsec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -402,7 +402,7 @@ def test_DRAM_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = get_DRAM_access_poly(knl)  # noqa
+    poly = get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -425,7 +425,7 @@ def test_DRAM_access_counter_nonconsec():
     assert f32nonconsec == n*m*l
 
 
-def test_DRAM_access_counter_consec():
+def test_gmem_access_counter_consec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -440,7 +440,7 @@ def test_DRAM_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = get_DRAM_access_poly(knl)
+    poly = get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -541,7 +541,7 @@ def test_all_counters_parallel_matmul():
     assert f32ops == n*m*l*2
     assert i32ops == n*m*l*4 + l*n*4
 
-    subscript_map = get_DRAM_access_poly(knl)
+    subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map.dict[
                         (np.dtype(np.float32), 'nonconsecutive', 'load')
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})