diff --git a/doc/misc.rst b/doc/misc.rst
index 97bac9fec35d1960f0b8dceb9489f8399b72520c..347b5d098c8dc0e37bb72659c0b0de5a8b4e3704 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -101,6 +101,10 @@ In the meantime, you can generate code simply by saying::
     print(cg_result.host_code())
     print(cg_result.device_code())
 
+Additionally, for C-based languages, header definitions are available via::
+
+    loopy.generate_header(knl)
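+
+:func:`loopy.generate_header` returns a list of header entries; each entry
+can be stringified individually, e.g.::
+
+    print(str(loopy.generate_header(knl)[0]))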
+
 For what types of codes does :mod:`loopy` work well?
 ----------------------------------------------------
 
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 3a15e3a585afc9a9b181ae21b202e20a104ad2a1..97d71f3e04051d45a2f911eb0f7b2eca7147b96b 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -242,6 +242,12 @@ These are usually key-value pairs. The following attributes are recognized:
       heuristic and indicate that the specified list of dependencies is
       exhaustive.
 
+* ``dep_query=...`` provides an alternative way of specifying instruction
+  dependencies. The given string is parsed as a match expression by
+  :func:`loopy.match.parse_match`. During kernel generation, all
+  instructions matching this expression are added as dependencies.
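+
+  For example, the following (hypothetical) snippet makes the last
+  instruction depend on all instructions whose ID begins with ``init_``,
+  using the ``id:`` match syntax of :func:`loopy.match.parse_match`::
+
+      a[i] = 0 {id=init_a}
+      b[i] = 0 {id=init_b}
+      c[i] = a[i] + b[i] {dep_query=id:init_*}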
+
 * ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary
   between the instructions with identifiers ``id1`` and ``id2``, even if
   a dependency chain exists and variables are accessed in an apparently
@@ -251,6 +257,9 @@ These are usually key-value pairs. The following attributes are recognized:
   function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
   ``id_prefix``.
 
+* ``nosync_query=...`` provides an alternative way of specifying ``nosync``,
+  analogous to how ``dep_query`` relates to ``dep``.
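+
+  For example (a hypothetical sketch), ``nosync_query=writes:tmp*`` states
+  that no synchronization is needed with any instruction that writes a
+  variable whose name begins with ``tmp``.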
+
 * ``priority=integer`` sets the instruction's priority to the value
   ``integer``. Instructions with higher priority will be scheduled sooner,
   if possible. Note that the scheduler may still schedule a lower-priority
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index f16f8bfdbb26b716af27762d8502bff592496d7c..4a07b63330747aa69d7ed498e004d60b7c312a7b 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -114,11 +114,15 @@ Finishing up
 
 .. autofunction:: get_one_scheduled_kernel
 
+.. autofunction:: save_and_reload_temporaries
+
 .. autoclass:: GeneratedProgram
 .. autoclass:: CodeGenerationResult
 
 .. autofunction:: generate_code_v2
 
+.. autofunction:: generate_header
+
 Setting options
 ---------------
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index fa6fcc95088198c28f17b2e383a54eb961419467..7c8ba2fc975265a7a76864b0de060ec58e492217 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -53,6 +53,13 @@ And some data on the host:
 
 .. }}}
 
+We'll also disable console syntax highlighting because it confuses
+doctest::
+
+    >>> # not a documented interface
+    >>> import loopy.options
+    >>> loopy.options.ALLOW_TERMINAL_COLORS = False
+
 Getting started
 ---------------
 
@@ -256,6 +263,14 @@ call :func:`loopy.generate_code`:
         out[i] = 2.0f * a[i];
     }
 
+Additionally, for C-based languages, header definitions can be obtained via
+:func:`loopy.generate_header`:
+
+.. doctest::
+
+    >>> header = str(lp.generate_header(typed_knl)[0])
+    >>> print(header)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out);
+
 .. }}}
 
 .. _ordering:
@@ -532,9 +547,8 @@ Consider this example:
     #define lid(N) ((int) get_local_id(N))
     ...
       for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer)
-        for (int i_inner = 0; i_inner <= 15; ++i_inner)
-          if (-1 + -1 * i_inner + -16 * i_outer + n >= 0)
-            a[16 * i_outer + i_inner] = 0.0f;
+        for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner)
+          a[16 * i_outer + i_inner] = 0.0f;
     ...
 
 By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
@@ -563,10 +577,9 @@ relation to loop nesting. For example, it's perfectly possible to request
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-      for (int i_inner = 0; i_inner <= 15; ++i_inner)
-        if (-1 + -1 * i_inner + n >= 0)
-          for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
-            a[16 * i_outer + i_inner] = 0.0f;
+      for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner)
+        for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+          a[16 * i_outer + i_inner] = 0.0f;
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
@@ -791,7 +804,9 @@ enabling some cost savings:
         a[4 * i_outer + 3] = 0.0f;
       }
       /* final slab for 'i_outer' */
-      for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer)
+      {
+        int const i_outer = -1 + n + -1 * (3 * n / 4);
+    <BLANKLINE>
         if (-1 + n >= 0)
         {
           a[4 * i_outer] = 0.0f;
@@ -802,6 +817,7 @@ enabling some cost savings:
           if (4 + 4 * i_outer + -1 * n == 0)
             a[4 * i_outer + 3] = 0.0f;
         }
+      }
     ...
 
 .. }}}
@@ -906,6 +922,8 @@ expression being assigned.
     ...     """)
     >>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
 
+.. _local_temporaries:
+
 Temporaries in local memory
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1048,30 +1066,156 @@ Generic Precomputation
 
 .. }}}
 
-.. _more-complicated-programs:
 
-More complicated programs
--------------------------
+.. _synchronization:
+
+Synchronization
+---------------
 
 .. {{{
 
-SCOP
+In OpenCL, writes are not generally guaranteed to be immediately visible to
+other work items. To ensure that memory is consistent across work items,
+some form of synchronization must be used.
 
-Data-dependent control flow
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+:mod:`loopy` supports synchronization in the form of *barriers* or *atomic
+operations*.
 
-Conditionals
-~~~~~~~~~~~~
+Barriers
+~~~~~~~~
 
-Snippets of C
-~~~~~~~~~~~~~
+Prior to code generation, :mod:`loopy` checks that every memory access is
+free of dependencies requiring a barrier. A memory access dependency that
+exists across multiple work items requires a barrier if it involves at
+least one write operation.
+
+:mod:`loopy` supports two kinds of barriers:
+
+* *Local barriers* ensure consistency of local memory accesses to items within
+  *the same* work group. As in OpenCL, all work items in the group are required
+  to wait until everyone has reached the barrier instruction before continuing.
+
+* *Global barriers* ensure consistency of *global* memory accesses across *all*
+  work groups. Note that there is no exact equivalent in OpenCL. All work items
+  across all work groups are required to wait until everyone has reached the
+  barrier instruction before continuing.
+
+By default, :mod:`loopy` inserts local barriers between two instructions when it
+detects that a dependency involving local memory may occur across work items. To
+see this in action, take a look at the section on :ref:`local_temporaries`.
+
+In contrast, :mod:`loopy` will *not* insert global barriers
+automatically. Consider the following kernel, which attempts to rotate its input
+to the right by 1:
+
+.. doctest::
+
+   >>> knl = lp.make_kernel(
+   ...     "[n] -> {[i] : 0<=i<n}",
+   ...     """
+   ...     for i
+   ...        <>tmp = arr[i] {id=maketmp,dep=*}
+   ...        arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp}
+   ...     end
+   ...     """,
+   ...     [
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+   ...         "...",
+   ...     ],
+   ...     name="rotate_v1",
+   ...     assumptions="n mod 16 = 0")
+   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> cgr = lp.generate_code_v2(knl)
+   Traceback (most recent call last):
+   ...
+   MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
+
+Because of the write-after-read dependency in global memory, a global barrier
+needs to be inserted. This can be accomplished with a ``... gbarrier``
+instruction. Note that :mod:`loopy` implements global barriers by splitting the
+kernel into multiple device-side kernels, so that the resulting code will
+contain more than one kernel.
+
+.. doctest::
+
+   >>> knl = lp.make_kernel(
+   ...     "[n] -> {[i] : 0<=i<n}",
+   ...     """
+   ...     for i
+   ...        <>tmp = arr[i] {id=maketmp,dep=*}
+   ...        ... gbarrier {id=bar,dep=*maketmp}
+   ...        arr[(i + 1) % n] = tmp {id=rotate,dep=*bar}
+   ...     end
+   ...     """,
+   ...     [
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+   ...         "...",
+   ...     ],
+   ...     name="rotate_v2",
+   ...     assumptions="n mod 16 = 0")
+   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> cgr = lp.generate_code_v2(knl)
+   >>> print(cgr.device_code())
+   #define lid(N) ((int) get_local_id(N))
+   #define gid(N) ((int) get_group_id(N))
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = arr[16 * gid(0) + lid(0)];
+   }
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n)
+   {
+     int tmp;
+   <BLANKLINE>
+     arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+   }
+
+Note that we are not done yet. While ``tmp`` is assigned in the first
+kernel, its value is not saved for the second one. :mod:`loopy` provides
+the function :func:`loopy.save_and_reload_temporaries` for saving and
+restoring temporary values across global barriers. Before this function can
+be used, the kernel must be preprocessed and scheduled, the latter of which
+is handled by :func:`loopy.get_one_scheduled_kernel`.
+
+.. doctest::
+
+   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+   >>> knl = lp.save_and_reload_temporaries(knl)
+   >>> knl = lp.get_one_scheduled_kernel(knl)
+   >>> cgr = lp.generate_code_v2(knl)
+   >>> print(cgr.device_code())
+   #define lid(N) ((int) get_local_id(N))
+   #define gid(N) ((int) get_group_id(N))
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = arr[16 * gid(0) + lid(0)];
+     tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
+   }
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = tmp_save_slot[16 * gid(0) + lid(0)];
+     arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+   }
+   >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True)
+   >>> print(out)
+   [15  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
 
 Atomic operations
 ~~~~~~~~~~~~~~~~~
 
-Loopy supports atomic operations. To use them, both the data on which the
-atomic operations work as well as the operations themselves must be
-suitably tagged, as in the following example::
+:mod:`loopy` supports atomic operations. To use them, both the data on which
+the atomic operations work and the operations themselves must be suitably
+tagged, as in the following example::
 
 
     knl = lp.make_kernel(
@@ -1086,6 +1230,49 @@ suitably tagged, as in the following example::
 
 .. }}}
 
+.. _more-complicated-programs:
+
+More complicated programs
+-------------------------
+
+.. {{{
+
+SCOP
+
+External Functions
+~~~~~~~~~~~~~~~~~~
+
+Loopy currently supports calls to several commonly used mathematical functions,
+e.g. exp/log, min/max, sin/cos/tan, sinh/cosh, abs, etc. They may be used in
+a loopy kernel simply by calling them, e.g.::
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            for i
+                a[i] = sqrt(i)
+            end
+            """)
+
+Additionally, all functions of one variable are currently recognized during
+code generation; however, additional implementation may be required for
+custom functions. The full list of available functions may be found in the
+:class:`TargetBase` implementation (e.g. :class:`CudaTarget`).
+
+Custom user functions may be represented using the method described in
+:ref:`functions`, as sketched below.
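+
+As a brief, hypothetical sketch, a custom function ``my_func`` of one
+argument might be made known to the code generator by registering a
+function mangler (the names here are illustrative, not a complete
+implementation)::
+
+    def my_func_mangler(kernel, name, arg_dtypes):
+        # recognize calls to my_func(x) and map them to a C-level
+        # function of the same name
+        if name == "my_func" and len(arg_dtypes) == 1:
+            return lp.CallMangleInfo(
+                    target_name="my_func",
+                    result_dtypes=arg_dtypes,
+                    arg_dtypes=arg_dtypes)
+        return None
+
+    knl = lp.register_function_manglers(knl, [my_func_mangler])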
+
+
+Data-dependent control flow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Conditionals
+~~~~~~~~~~~~
+
+Snippets of C
+~~~~~~~~~~~~~
+
+.. }}}
+
 Common Problems
 ---------------
 
@@ -1213,26 +1400,30 @@ Obtaining Performance Statistics
 
 .. {{{
 
-Operations, array access, and barriers can all be counted, which may facilitate
-performance prediction and optimization of a :mod:`loopy` kernel.
+Arithmetic operations, array accesses, and synchronization operations can all
+be counted, which may facilitate performance prediction and optimization of a
+:mod:`loopy` kernel.
 
 .. note::
 
     The functions used in the following examples may produce warnings. If you have
     already made the filterwarnings and catch_warnings calls used in the examples
-    above, you may need to reset these before continuing:
+    above, you may want to reset these before continuing. We will temporarily
+    suppress warnings to keep the output clean:
 
     .. doctest::
 
-        >>> from warnings import resetwarnings
+        >>> from warnings import resetwarnings, filterwarnings
         >>> resetwarnings()
+        >>> filterwarnings('ignore', category=Warning)
 
 Counting operations
 ~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_op_poly` provides information on the number and type of operations
-being performed in a kernel. To demonstrate this, we'll create an example kernel
-that performs several operations on arrays containing different types of data:
+:func:`loopy.get_op_map` provides information on the characteristics and
+quantity of arithmetic operations being performed in a kernel. To demonstrate
+this, we'll create an example kernel that performs several operations on arrays
+containing different types of data:
 
 .. doctest::
 
@@ -1250,38 +1441,42 @@ information provided. Now we will count the operations:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_op_poly
-    >>> op_map = get_op_poly(knl)
+    >>> op_map = lp.get_op_map(knl)
+    >>> print(lp.stringify_stats_mapping(op_map))
+    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
 
-:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** 
-:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The 
-:class:`islpy.PwQPolynomial` holds the number of operations for the type specified 
-in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this 
-map now:
+:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
+:class:`loopy.ToCountMap` holds a dictionary mapping keys of any type to
+counts. In this case, each :class:`islpy.PwQPolynomial` holds the number of
+operations matching the characteristics of the :class:`loopy.Op` specified
+in the key (in terms of the :class:`loopy.LoopKernel` *inames*).
+:class:`loopy.Op` attributes include:
 
-.. doctest::
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type operated on.
 
-    >>> print(lp.stringify_stats_mapping(op_map))
-    (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    <BLANKLINE>
+- name: A :class:`str` that specifies the kind of arithmetic operation as
+  *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
 
-We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
+One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
-    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
-    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
-    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
-    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
-    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
-    >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
+    >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
+    >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
     524288
@@ -1290,174 +1485,238 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
     65536
     65536
 
-Counting array accesses
-~~~~~~~~~~~~~~~~~~~~~~~
+:class:`loopy.ToCountMap` provides member functions that facilitate filtering,
+grouping, and evaluating subsets of the counts. Suppose we want to know the
+total number of 32-bit floating point operations of any kind. We can easily
+count these using the functions :func:`loopy.ToCountMap.filter_by` and
+:func:`loopy.ToCountMap.eval_and_sum`:
+
+.. doctest::
+
+    >>> filtered_op_map = op_map.filter_by(dtype=[np.float32])
+    >>> f32op_count = filtered_op_map.eval_and_sum(param_dict)
+    >>> print(f32op_count)
+    1572864
 
-:func:`loopy.get_gmem_access_poly` provides information on the number and type of
-array loads and stores being performed in a kernel. To demonstrate this, we'll
-continue using the kernel from the previous example:
+We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`,
+which produces a :class:`loopy.ToCountMap` that contains the same counts grouped
+together into keys containing only the specified fields:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_gmem_access_poly
-    >>> load_store_map = get_gmem_access_poly(knl)
-    >>> print(lp.stringify_stats_mapping(load_store_map))
-    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    >>> op_map_dtype = op_map.group_by('dtype')
+    >>> print(lp.stringify_stats_mapping(op_map_dtype))
+    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
+    >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
+    ...                           ].eval_with_dict(param_dict)
+    >>> print(f32op_count)
+    1572864
 
-:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
-:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
-**:** :class:`islpy.PwQPolynomial` **}**.
+See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
+more information on these functions.
+
+Counting memory accesses
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`loopy.get_mem_access_map` provides information on the number and
+characteristics of memory accesses performed in a kernel. To demonstrate this,
+we'll continue using the kernel from the previous example:
+
+.. doctest::
+
+    >>> mem_map = lp.get_mem_access_map(knl)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
 
-- The :class:`numpy.dtype` specifies the type of the data being accessed.
+:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+:class:`loopy.MemAccess` attributes include:
 
-- The first string in the map key specifies the DRAM access type as *consecutive*,
-  *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when
-  consecutive threads access consecutive array elements in memory, *nonconsecutive*
-  accesses occur when consecutive threads access nonconsecutive array elements in
-  memory, and *uniform* accesses occur when consecutive threads access the *same*
-  element in memory.
+- mtype: A :class:`str` that specifies the memory type accessed as **global**
+  or **local**.
 
-- The second string in the map key specifies the DRAM access type as a *load*, or a
-  *store*.
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type accessed.
 
-- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the
-  characteristics specified in the key (in terms of the :class:`loopy.LoopKernel`
-  *inames*).
+- stride: An :class:`int` that specifies the stride of the memory access. A
+  stride of 0 indicates a uniform access (i.e. all threads access the same
+  item).
+
+- direction: A :class:`str` that specifies the direction of memory access as
+  **load** or **store**.
+
+- variable: A :class:`str` that specifies the variable name of the data
+  accessed.
 
 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
+
+:class:`loopy.ToCountMap` also makes it easy to determine the total amount
+of data moved in bytes. Suppose we want to know the total amount of global
+memory data loaded and stored. We can produce a map with just this information
+using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
+
+.. doctest::
+
+    >>> bytes_map = mem_map.to_bytes()
+    >>> print(lp.stringify_stats_mapping(bytes_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
+    ...                                         ).group_by('direction')
+    >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
+    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
+    ...                            ].eval_with_dict(param_dict)
+    >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
+    ...                            ].eval_with_dict(param_dict)
+    >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
+    bytes loaded: 7340032
+    bytes stored: 2621440
+
+One can see how these functions might be useful in computing, for example,
+achieved memory bandwidth in byte/sec or performance in FLOP/sec.
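+
+As a brief sketch (with ``t`` standing in for a measured wall-clock runtime
+in seconds, a hypothetical quantity here)::
+
+    t = 1e-3  # hypothetical measured kernel runtime in seconds
+    gflop_rate = (f32add + f32div + f32mul + f64add + f64mul) / t / 1e9
+    gbytes_per_sec = (loaded + stored) / t / 1e9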
 
 ~~~~~~~~~~~
 
-Since we have not tagged any of the inames or parallelized the kernel across threads
-(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
-the array accesses *uniform*. Now we'll parallelize the kernel and count the array
-accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
-this time, so we'll print the mapping manually to make it more legible:
+Since we have not tagged any of the inames or parallelized the kernel across
+threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
+considers the memory accesses *uniform*, so the *stride* of each access is 0.
+Now we'll parallelize the kernel and count the array accesses again. The
+resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
 .. doctest::
 
-    >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
-    >>> load_store_map = get_gmem_access_poly(knl_consec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
+    >>> knl_consec = lp.split_iname(knl, "k", 128,
+    ...                             outer_tag="l.1", inner_tag="l.0")
+    >>> mem_map = lp.get_mem_access_map(knl_consec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-
 
 With this parallelization, consecutive threads will access consecutive array
 elements in memory. The polynomials are a bit more complicated now due to the
-parallelization, but when we evaluate them, we see that the total number of array
-accesses has not changed:
-
-.. doctest::
-
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+parallelization, but when we evaluate them, we see that the total number of
+array accesses has not changed:
+
+.. doctest::
+
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
 
 ~~~~~~~~~~~
 
-To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in
-our parallelization of the kernel:
+To produce *nonconsecutive* array accesses with stride greater than 1, we'll
+switch the inner and outer tags in our parallelization of the kernel:
 
 .. doctest::
 
-    >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
-    >>> load_store_map = get_gmem_access_poly(knl_nonconsec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
+    >>> knl_nonconsec = lp.split_iname(knl, "k", 128,
+    ...                                outer_tag="l.0", inner_tag="l.1")
+    >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
 
+With this parallelization, consecutive threads will access *nonconsecutive*
+array elements in memory. The total number of array accesses still has not
+changed:
+
+.. doctest::
 
-With this parallelization, consecutive threads will access *nonconsecutive* array
-elements in memory. The total number of array accesses has not changed:
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
+
+We can also filter using an arbitrary test function using
+:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter
+criteria are more complicated than a simple list of allowable values:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> def f(key):
+    ...     from loopy.types import to_loopy_type
+    ...     return key.dtype == to_loopy_type(np.float32) and \
+    ...            key.stride > 1
+    >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
+    >>> print(count)
+    2097152
 
 Counting synchronization events
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_synchronization_poly` counts the number of synchronization
+:func:`loopy.get_synchronization_map` counts the number of synchronization
 events per **thread** in a kernel. First, we'll call this function on the
 kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_synchronization_poly
-    >>> barrier_poly = get_synchronization_poly(knl)
-    >>> print(lp.stringify_stats_mapping(barrier_poly))
+    >>> sync_map = lp.get_synchronization_map(knl)
+    >>> print(lp.stringify_stats_mapping(sync_map))
     kernel_launch : { 1 }
     <BLANKLINE>
 
@@ -1465,7 +1724,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
+    >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
     >>> print("Kernel launch count: %s" % launch_count)
     Kernel launch count: 1
 
@@ -1495,36 +1754,38 @@ Now to make things more interesting, we'll create a kernel with barriers:
     {
       __local int c[50 * 10 * 99];
     <BLANKLINE>
-      int const k_outer = 0;
+      {
+        int const k_outer = 0;
     <BLANKLINE>
-      for (int j = 0; j <= 9; ++j)
-        for (int i = 0; i <= 49; ++i)
-        {
-          barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
-          c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
-          barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
-          e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
-        }
+        for (int j = 0; j <= 9; ++j)
+          for (int i = 0; i <= 49; ++i)
+          {
+            barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
+            c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
+            barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
+            e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
+          }
+      }
     }
 
-
-In this kernel, when a thread performs the second instruction it uses data produced
-by *different* threads during the first instruction. Because of this, barriers are
-required for correct execution, so loopy inserts them. Now we'll count the barriers
-using :func:`loopy.get_barrier_poly`:
+In this kernel, when a thread performs the second instruction it uses data
+produced by *different* threads during the first instruction. Because of this,
+barriers are required for correct execution, so loopy inserts them. Now we'll
+count the barriers using :func:`loopy.get_synchronization_map`:
 
 .. doctest::
 
-    >>> sync_map = lp.get_synchronization_poly(knl)
+    >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(lp.stringify_stats_mapping(sync_map))
     barrier_local : { 1000 }
     kernel_launch : { 1 }
     <BLANKLINE>
 
-Based on the kernel code printed above, we would expect each thread to encounter
-50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
-this case, the number of barriers does not depend on any inames, so we can pass an
-empty dictionary to :func:`islpy.eval_with_dict`.
+Based on the kernel code printed above, we would expect each thread to
+encounter 50x10x2 barriers, which matches the result from
+:func:`loopy.get_synchronization_map`. In this case, the number of barriers
+does not depend on any inames, so we can pass an empty dictionary to
+:func:`islpy.eval_with_dict`.
 
 .. }}}
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 21a41b11c9b84d288aa2cbb5146db23538613688..6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -106,16 +106,17 @@ from loopy.transform.padding import (
 from loopy.transform.ilp import realize_ilp
 from loopy.transform.batch import to_batched
 from loopy.transform.parameter import assume, fix_parameters
+from loopy.transform.save import save_and_reload_temporaries
 
 # }}}
 
-from loopy.preprocess import (preprocess_kernel, realize_reduction,
-        infer_unknown_types)
+from loopy.type_inference import infer_unknown_types
+from loopy.preprocess import preprocess_kernel, realize_reduction
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
-        get_gmem_access_poly,
-        get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
-        sum_mem_access_to_bytes,
+from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op,
+        MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
+        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map,
+        get_synchronization_poly, get_synchronization_map,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
@@ -130,7 +131,7 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
         parse_fortran)
 
 from loopy.target import TargetBase, ASTBuilderBase
-from loopy.target.c import CTarget
+from loopy.target.c import CTarget, generate_header
 from loopy.target.cuda import CudaTarget
 from loopy.target.opencl import OpenCLTarget
 from loopy.target.pyopencl import PyOpenCLTarget
@@ -206,6 +207,8 @@ __all__ = [
 
         "assume", "fix_parameters",
 
+        "save_and_reload_temporaries",
+
         # }}}
 
         "get_dot_dependency_graph",
@@ -213,16 +216,18 @@ __all__ = [
         "add_dtypes",
         "add_and_infer_dtypes",
 
-        "preprocess_kernel", "realize_reduction", "infer_unknown_types",
+        "infer_unknown_types",
+
+        "preprocess_kernel", "realize_reduction",
         "generate_loop_schedules", "get_one_scheduled_kernel",
         "GeneratedProgram", "CodeGenerationResult",
         "PreambleInfo",
         "generate_code", "generate_code_v2", "generate_body",
 
-        "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
-        "get_DRAM_access_poly",
-        "get_synchronization_poly", "stringify_stats_mapping",
-        "sum_mem_access_to_bytes",
+        "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess",
+        "get_op_poly", "get_op_map", "get_lmem_access_poly",
+        "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map",
+        "get_synchronization_poly", "get_synchronization_map",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
@@ -236,7 +241,9 @@ __all__ = [
 
         "LoopyError", "LoopyWarning",
 
-        "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget",
+        "TargetBase",
+        "CTarget", "generate_header",
+        "CudaTarget", "OpenCLTarget",
         "PyOpenCLTarget", "ISPCTarget",
         "NumbaTarget", "NumbaCudaTarget",
         "ASTBuilderBase",
@@ -254,7 +261,6 @@ __all__ = [
         # }}}
         ]
 
-
 # }}}
 
 
@@ -274,6 +280,9 @@ def set_options(kernel, *args, **kwargs):
     new_opt = kernel.options.copy()
 
     if kwargs:
+        from loopy.options import _apply_legacy_map, Options
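+        # map any deprecated (legacy) option names to their current spellings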
+        kwargs = _apply_legacy_map(Options._legacy_options_map, kwargs)
+
         for key, val in six.iteritems(kwargs):
             if not hasattr(new_opt, key):
                 raise ValueError("unknown option '%s'" % key)
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 479b898be610f6c9694be14f2095764ff14b767c..6a4d559758bc1d7ca52e9dc4da1b7e503e22cc29 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -109,7 +109,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
             ref_arg_data.append(None)
 
         elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg \
-            or arg.arg_class is ConstantArg:
+                or arg.arg_class is ConstantArg:
             if arg.shape is None or any(saxis is None for saxis in arg.shape):
                 raise LoopyError("array '%s' needs known shape to use automatic "
                         "testing" % arg.name)
@@ -422,7 +422,7 @@ def auto_test_vs_ref(
 
     # {{{ compile and run reference code
 
-    from loopy.preprocess import infer_unknown_types
+    from loopy.type_inference import infer_unknown_types
     ref_knl = infer_unknown_types(ref_knl, expect_completion=True)
 
     found_ref_device = False
@@ -530,7 +530,7 @@ def auto_test_vs_ref(
 
     test_kernel_count = 0
 
-    from loopy.preprocess import infer_unknown_types
+    from loopy.type_inference import infer_unknown_types
     for i, kernel in enumerate(test_kernels):
         test_kernel_count += 1
         if test_kernel_count > max_test_kernel_count:
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 79d824a44fc04f479139f2797994621f09798297..6f312ec798e13fa4b1d183c27578089857b13e3d 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -25,7 +25,7 @@ THE SOFTWARE.
 import six
 
 from loopy.diagnostic import LoopyError, warn
-from pytools import Record
+from pytools import ImmutableRecord
 import islpy as isl
 
 from pytools.persistent_dict import PersistentDict
@@ -38,7 +38,7 @@ logger = logging.getLogger(__name__)
 
 # {{{ implemented data info
 
-class ImplementedDataInfo(Record):
+class ImplementedDataInfo(ImmutableRecord):
     """
     .. attribute:: name
 
@@ -91,7 +91,7 @@ class ImplementedDataInfo(Record):
         from loopy.types import LoopyType
         assert isinstance(dtype, LoopyType)
 
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 name=name,
                 dtype=dtype,
                 arg_class=arg_class,
@@ -127,7 +127,7 @@ class VectorizationInfo(object):
         self.space = space
 
 
-class SeenFunction(Record):
+class SeenFunction(ImmutableRecord):
     """
     .. attribute:: name
     .. attribute:: c_name
@@ -137,15 +137,11 @@ class SeenFunction(Record):
     """
 
     def __init__(self, name, c_name, arg_dtypes):
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 name=name,
                 c_name=c_name,
                 arg_dtypes=arg_dtypes)
 
-    def __hash__(self):
-        return hash((type(self),)
-                + tuple((f, getattr(self, f)) for f in type(self).fields))
-
 
 class CodeGenerationState(object):
     """
@@ -365,7 +361,7 @@ code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
-class PreambleInfo(Record):
+class PreambleInfo(ImmutableRecord):
     """
     .. attribute:: kernel
     .. attribute:: seen_dtypes
@@ -409,7 +405,7 @@ def generate_code_v2(kernel):
 
     # }}}
 
-    from loopy.preprocess import infer_unknown_types
+    from loopy.type_inference import infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
 
     from loopy.check import pre_codegen_checks
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index fb254bd54480f716de54de96f6aab9a4bb427767..7cc381f11d1239cba5656a9dc7a04cddaa14a368 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -27,30 +27,24 @@ import islpy as isl
 from islpy import dim_type
 
 
-# {{{ bounds check generator
+# {{{ approximate, convex bounds check generator
 
-def get_bounds_checks(domain, check_inames, implemented_domain,
-        overapproximate):
+def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domain):
     if isinstance(domain, isl.BasicSet):
         domain = isl.Set.from_basic_set(domain)
     domain = domain.remove_redundancies()
     result = domain.eliminate_except(check_inames, [dim_type.set])
 
-    if overapproximate:
-        # This is ok, because we're really looking for the
-        # projection, with no remaining constraints from
-        # the eliminated variables.
-        result = result.remove_divs()
-    else:
-        result = result.compute_divs()
+    # This is ok, because we're really looking for the
+    # projection, with no remaining constraints from
+    # the eliminated variables.
+    result = result.remove_divs()
 
     result, implemented_domain = isl.align_two(result, implemented_domain)
     result = result.gist(implemented_domain)
 
-    if overapproximate:
-        result = result.remove_divs()
-    else:
-        result = result.compute_divs()
+    # (see above)
+    result = result.remove_divs()
 
     from loopy.isl_helpers import convexify
     result = convexify(result)
@@ -62,23 +56,33 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
 # {{{ on which inames may a conditional depend?
 
 def get_usable_inames_for_conditional(kernel, sched_index):
-    from loopy.schedule import EnterLoop, LeaveLoop
+    from loopy.schedule import (
+        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
     from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag
 
-    result = set()
+    result = find_active_inames_at(kernel, sched_index)
+    crosses_barrier = has_barrier_within(kernel, sched_index)
 
-    for i, sched_item in enumerate(kernel.schedule):
-        if i >= sched_index:
-            break
-        if isinstance(sched_item, EnterLoop):
-            result.add(sched_item.iname)
-        elif isinstance(sched_item, LeaveLoop):
-            result.remove(sched_item.iname)
+    # Find our containing subkernel, grab inames for all insns from there.
 
-    for iname in kernel.all_inames():
+    subkernel_index = sched_index
+    from loopy.schedule import CallKernel
+
+    while not isinstance(kernel.schedule[subkernel_index], CallKernel):
+        subkernel_index -= 1
+
+    insn_ids_for_subkernel = get_insn_ids_for_block_at(
+        kernel.schedule, subkernel_index)
+
+    inames_for_subkernel = (
+        iname
+        for insn in insn_ids_for_subkernel
+        for iname in kernel.insn_inames(insn))
+
+    for iname in inames_for_subkernel:
         tag = kernel.iname_to_tag.get(iname)
 
-        # Parallel inames are always defined, BUT:
+        # Parallel inames are defined within a subkernel, BUT:
         #
         # - local indices may not be used in conditionals that cross barriers.
         #
@@ -87,7 +91,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
 
         if (
                 isinstance(tag, ParallelTag)
-                and not isinstance(tag, LocalIndexTagBase)
+                and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier)
                 and not isinstance(tag, IlpBaseTag)
                 ):
             result.add(iname)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 3378ed81ee56f97cc11f8f8998aeb67221061633..d206faad5bd84e3a1c7e7c061673f3d5d1144c84 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -150,8 +150,15 @@ def generate_code_for_sched_index(codegen_state, sched_index):
         return func(codegen_state, sched_index)
 
     elif isinstance(sched_item, Barrier):
-        return codegen_state.ast_builder.emit_barrier(
-                sched_item.kind, sched_item.comment)
+        if codegen_state.is_generating_device_code:
+            return codegen_state.ast_builder.emit_barrier(
+                    sched_item.kind, sched_item.comment)
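+        # In host code, a global barrier manifests as the split into
+        # separate device kernels; there is nothing to emit here.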
+        from loopy.codegen.result import CodeGenerationResult
+        return CodeGenerationResult(
+                host_program=None,
+                device_programs=[],
+                implemented_domains={},
+                implemented_data_info=codegen_state.implemented_data_info)
 
     elif isinstance(sched_item, RunInstruction):
         insn = kernel.id_to_insn[sched_item.insn_id]
@@ -248,9 +255,9 @@ def build_loop_nest(codegen_state, schedule_index):
 
     # {{{ pass 2: find admissible conditional inames for each sibling schedule item
 
-    from pytools import Record
+    from pytools import ImmutableRecord
 
-    class ScheduleIndexInfo(Record):
+    class ScheduleIndexInfo(ImmutableRecord):
         """
         .. attribute:: schedule_index
         .. attribute:: admissible_cond_inames
@@ -301,13 +308,11 @@ def build_loop_nest(codegen_state, schedule_index):
             domain = isl.align_spaces(
                     self.kernel.get_inames_domain(check_inames),
                     self.impl_domain, obj_bigger_ok=True)
-            from loopy.codegen.bounds import get_bounds_checks
-            return get_bounds_checks(domain,
-                    check_inames, self.impl_domain,
-
-                    # Each instruction individually gets its bounds checks,
-                    # so we can safely overapproximate here.
-                    overapproximate=True)
+            from loopy.codegen.bounds import get_approximate_convex_bounds_checks
+            # Each instruction individually gets its bounds checks,
+            # so we can safely overapproximate here.
+            return get_approximate_convex_bounds_checks(domain,
+                    check_inames, self.impl_domain)
 
     def build_insn_group(sched_index_info_entries, codegen_state,
             done_group_lengths=set()):
@@ -318,6 +323,8 @@ def build_loop_nest(codegen_state, schedule_index):
             recursive calls from doing anything about groups that are too small.
         """
 
+        from loopy.symbolic import get_dependencies
+
         # The rough plan here is that build_insn_group starts out with the
         # entirety of the current schedule item's downward siblings (i.e. all
         # the ones up to the next LeaveLoop). It will then iterate upward to
@@ -365,6 +372,11 @@ def build_loop_nest(codegen_state, schedule_index):
                     & sched_index_info_entries[candidate_group_length-1]
                     .required_predicates)
 
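+            # Keep only predicates whose iname dependencies are covered by
+            # the inames of the current group; others cannot be checked yet.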
+            current_pred_set = frozenset(
+                    pred for pred in current_pred_set
+                    if get_dependencies(pred) & kernel.all_inames()
+                    <= current_iname_set)
+
             # {{{ see which inames are actually used in group
 
             # And only generate conditionals for those.
@@ -451,13 +463,13 @@ def build_loop_nest(codegen_state, schedule_index):
             # gen_code returns a list
 
             if bounds_checks or pred_checks:
-                from loopy.symbolic import constraint_to_expr
+                from loopy.symbolic import constraint_to_cond_expr
 
                 prev_gen_code = gen_code
 
                 def gen_code(inner_codegen_state):
                     condition_exprs = [
-                            constraint_to_expr(cns)
+                            constraint_to_cond_expr(cns)
                             for cns in bounds_checks] + [
                                 pred_chk for pred_chk in pred_checks]
 
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 140ec644731d570fac2e793f0c4e5ea004d165e6..c490abb6ed1635c135fc77468f27cd833b1d57b2 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 from six.moves import range
 import islpy as isl
+dim_type = isl.dim_type
 from loopy.codegen import Unvectorizable
 from loopy.codegen.result import CodeGenerationResult
 from pymbolic.mapper.stringifier import PREC_NONE
@@ -34,24 +35,27 @@ from pymbolic.mapper.stringifier import PREC_NONE
 
 def to_codegen_result(
         codegen_state, insn_id, domain, check_inames, required_preds, ast):
-    from loopy.codegen.bounds import get_bounds_checks
-    from loopy.symbolic import constraint_to_expr
-
-    bounds_checks = get_bounds_checks(
-            domain, check_inames,
-            codegen_state.implemented_domain, overapproximate=False)
-    bounds_check_set = isl.Set.universe(domain.get_space()) \
-            .add_constraints(bounds_checks)
-    bounds_check_set, new_implemented_domain = isl.align_two(
-            bounds_check_set, codegen_state.implemented_domain)
-    new_implemented_domain = new_implemented_domain & bounds_check_set
-
-    if bounds_check_set.is_empty():
+    # {{{ get bounds check
+
+    chk_domain = isl.Set.from_basic_set(domain)
+    chk_domain = chk_domain.remove_redundancies()
+    chk_domain = chk_domain.eliminate_except(check_inames, [dim_type.set])
+
+    chk_domain, implemented_domain = isl.align_two(
+            chk_domain, codegen_state.implemented_domain)
+    chk_domain = chk_domain.gist(implemented_domain)
+
+    # }}}
+
+    new_implemented_domain = implemented_domain & chk_domain
+
+    if chk_domain.is_empty():
         return None
 
-    condition_exprs = [
-            constraint_to_expr(cns)
-            for cns in bounds_checks]
+    condition_exprs = []
+    if not chk_domain.plain_is_universe():
+        from loopy.symbolic import set_to_cond_expr
+        condition_exprs.append(set_to_cond_expr(chk_domain))
 
     condition_exprs.extend(
             required_preds - codegen_state.implemented_predicates)
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 648c3fe6f5b748dcc47de5ac972bb82ce605a9a9..8ac963835ec12702f2010806d1d49062422318a2 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -386,48 +386,39 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
 
         _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]
 
-        from loopy.isl_helpers import (
-                static_min_of_pw_aff,
-                static_max_of_pw_aff)
-
         lbound = (
                 kernel.cache_manager.dim_min(
                     dom_and_slab, loop_iname_idx)
                 .gist(kernel.assumptions)
+                .gist(dom_and_slab.params())
                 .coalesce())
         ubound = (
             kernel.cache_manager.dim_max(
                 dom_and_slab, loop_iname_idx)
             .gist(kernel.assumptions)
+            .gist(dom_and_slab.params())
             .coalesce())
 
-        static_lbound = static_min_of_pw_aff(
-                lbound,
-                constants_only=False)
-        static_ubound = static_max_of_pw_aff(
-                ubound,
-                constants_only=False)
-
         # }}}
 
-        # {{{ find implemented slab, build inner code
+        # {{{ find implemented loop, build inner code
 
-        from loopy.isl_helpers import make_slab_from_bound_pwaffs
+        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
 
-        # impl_slab may be overapproximated
-        impl_slab = make_slab_from_bound_pwaffs(
+        # impl_loop may be overapproximated
+        impl_loop = make_loop_bounds_from_pwaffs(
                 dom_and_slab.space,
-                loop_iname, static_lbound, static_ubound)
+                loop_iname, lbound, ubound)
 
         for iname in moved_inames:
-            dt, idx = impl_slab.get_var_dict()[iname]
-            impl_slab = impl_slab.move_dims(
-                    dim_type.set, impl_slab.dim(dim_type.set),
+            dt, idx = impl_loop.get_var_dict()[iname]
+            impl_loop = impl_loop.move_dims(
+                    dim_type.set, impl_loop.dim(dim_type.set),
                     dt, idx, 1)
 
         new_codegen_state = (
                 codegen_state
-                .intersect(impl_slab)
+                .intersect(impl_loop)
                 .copy(kernel=intersect_kernel_with_slab(
                     kernel, slab, iname)))
 
@@ -438,21 +429,30 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
         if cmt is not None:
             result.append(codegen_state.ast_builder.emit_comment(cmt))
 
-        from loopy.symbolic import aff_to_expr
-
         astb = codegen_state.ast_builder
 
-        if (static_ubound - static_lbound).plain_is_zero():
+        zero = isl.PwAff.zero_on_domain(
+            isl.LocalSpace.from_space(
+                lbound.get_space()).domain())
+
+        from loopy.symbolic import pw_aff_to_expr
+
+        if (ubound - lbound).plain_is_equal(zero):
             # single-trip, generate just a variable assignment, not a loop
-            result.append(merge_codegen_results(codegen_state, [
+            inner = merge_codegen_results(codegen_state, [
                 astb.emit_initializer(
                     codegen_state,
                     kernel.index_dtype, loop_iname,
-                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
+                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                     is_const=True),
                 astb.emit_blank_line(),
                 inner,
-                ]))
+                ])
+            result.append(
+                    inner.with_new_ast(
+                        codegen_state,
+                        astb.ast_block_scope_class(
+                            inner.current_ast(codegen_state))))
 
         else:
             inner_ast = inner.current_ast(codegen_state)
@@ -461,7 +461,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
                     codegen_state,
                     astb.emit_sequential_loop(
                         codegen_state, loop_iname, kernel.index_dtype,
-                        static_lbound, static_ubound, inner_ast)))
+                        pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast)))
 
     return merge_codegen_results(codegen_state, result)
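
A rough illustration of why keeping the loop bounds as piecewise quasi-affine
expressions matters; the bound shown is hypothetical::

    import islpy as isl
    from loopy.symbolic import pw_aff_to_expr

    # a data-dependent upper bound: max(n - 4, 0)
    ubound = isl.PwAff("[n] -> { [(n - 4)] : n >= 4; [(0)] : n < 4 }")

    # pw_aff_to_expr turns this into a pymbolic expression (with a
    # select/If where needed), instead of forcing a single static
    # affine over-approximation as static_max_of_pw_aff did
    print(pw_aff_to_expr(ubound))
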
 
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 04fab05afdc38a8843a566e0e6e6b10098d6415c..4318ad71c1b16deeaac98f8408d5ca82f2de1714 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -23,7 +23,7 @@ THE SOFTWARE.
 """
 
 import six
-from pytools import Record
+from pytools import ImmutableRecord
 
 
 def process_preambles(preambles):
@@ -45,7 +45,7 @@ def process_preambles(preambles):
 
 # {{{ code generation result
 
-class GeneratedProgram(Record):
+class GeneratedProgram(ImmutableRecord):
     """
     .. attribute:: name
 
@@ -64,7 +64,7 @@ class GeneratedProgram(Record):
     """
 
 
-class CodeGenerationResult(Record):
+class CodeGenerationResult(ImmutableRecord):
     """
     .. attribute:: host_program
     .. attribute:: device_programs
@@ -207,6 +207,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
     codegen_result = None
 
     block_cls = codegen_state.ast_builder.ast_block_class
+    block_scope_cls = codegen_state.ast_builder.ast_block_scope_class
 
     for el in elements:
         if isinstance(el, CodeGenerationResult):
@@ -227,7 +228,8 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
                         dev_program_names.add(dp.name)
 
             cur_ast = el.current_ast(codegen_state)
-            if isinstance(cur_ast, block_cls):
+            if (isinstance(cur_ast, block_cls)
+                    and not isinstance(cur_ast, block_scope_cls)):
                 ast_els.extend(cur_ast.contents)
             else:
                 ast_els.append(cur_ast)
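
The merging change above can be pictured with plain stand-ins for the AST
classes; this is illustrative only, the real classes come from the target's
AST builder::

    class Block(object):            # stands in for ast_block_class
        def __init__(self, contents):
            self.contents = contents

    class ScopeBlock(Block):        # stands in for ast_block_scope_class
        pass

    def merge(elements):
        out = []
        for el in elements:
            # plain blocks are flattened into the parent, but scoped
            # blocks stay intact so that declarations inside them
            # remain confined to their own scope
            if isinstance(el, Block) and not isinstance(el, ScopeBlock):
                out.extend(el.contents)
            else:
                out.append(el)
        return Block(out)
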
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 89600102e09bb96173bb11db5c71d14dd3b2a206..29996d6c78b6fd99e52a750968291d0dd3d7c941 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -89,9 +89,7 @@ class StaticValueFindingError(LoopyError):
 
 
 class DependencyTypeInferenceFailure(TypeInferenceFailure):
-    def __init__(self, message, symbol):
-        TypeInferenceFailure.__init__(self, message)
-        self.symbol = symbol
+    pass
 
 
 class MissingBarrierError(LoopyError):
diff --git a/loopy/execution.py b/loopy/execution.py
index 802684247f9f95a4374838ddcfaaae0ddbadec2e..5680fdbfef614a0df1674a56842acd1869d14636 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -25,13 +25,13 @@ THE SOFTWARE.
 
 import six
 import numpy as np
-from pytools import Record, memoize_method
+from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import LoopyError
 
 
 # {{{ object array argument packing
 
-class _PackingInfo(Record):
+class _PackingInfo(ImmutableRecord):
     """
     .. attribute:: name
     .. attribute:: sep_shape
@@ -160,7 +160,7 @@ class KernelExecutorBase(object):
 
             kernel = add_dtypes(kernel, var_to_dtype)
 
-            from loopy.preprocess import infer_unknown_types
+            from loopy.type_inference import infer_unknown_types
             kernel = infer_unknown_types(kernel, expect_completion=True)
 
         if kernel.schedule is None:
diff --git a/loopy/expression.py b/loopy/expression.py
index 991f4a93e30a76a09b527e4fd326cfafff5e7569..3269bc09f064f57857eaa5218c8370383e0f735e 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -25,14 +25,10 @@ THE SOFTWARE.
 
 import numpy as np
 
-from pymbolic.mapper import CombineMapper, RecursiveMapper
+from pymbolic.mapper import RecursiveMapper
 
-from loopy.tools import is_integer
-from loopy.types import NumpyType
 from loopy.codegen import Unvectorizable
-from loopy.diagnostic import (
-        LoopyError,
-        TypeInferenceFailure, DependencyTypeInferenceFailure)
+from loopy.diagnostic import LoopyError
 
 
 # type_context may be:
@@ -57,264 +53,6 @@ def dtype_to_type_context(target, dtype):
     return None
 
 
-# {{{ type inference
-
-class TypeInferenceMapper(CombineMapper):
-    def __init__(self, kernel, new_assignments=None):
-        """
-        :arg new_assignments: mapping from names to either
-            :class:`loopy.kernel.data.TemporaryVariable`
-            or
-            :class:`loopy.kernel.data.KernelArgument`
-            instances
-        """
-        self.kernel = kernel
-        if new_assignments is None:
-            new_assignments = {}
-        self.new_assignments = new_assignments
-
-    # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
-    # are Python-equal (for many common constants such as integers).
-
-    def with_assignments(self, names_to_vars):
-        new_ass = self.new_assignments.copy()
-        new_ass.update(names_to_vars)
-        return type(self)(self.kernel, new_ass)
-
-    @staticmethod
-    def combine(dtypes):
-        # dtypes may just be a generator expr
-        dtypes = list(dtypes)
-
-        from loopy.types import LoopyType, NumpyType
-        assert all(isinstance(dtype, LoopyType) for dtype in dtypes)
-
-        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
-            from pytools import is_single_valued, single_valued
-            if not is_single_valued(dtypes):
-                raise TypeInferenceFailure(
-                        "Nothing known about operations between '%s'"
-                        % ", ".join(str(dt) for dt in dtypes))
-
-            return single_valued(dtypes)
-
-        dtypes = [dtype.dtype for dtype in dtypes]
-
-        result = dtypes.pop()
-        while dtypes:
-            other = dtypes.pop()
-
-            if result.fields is None and other.fields is None:
-                if (result, other) in [
-                        (np.int32, np.float32), (np.float32, np.int32)]:
-                    # numpy makes this a double. I disagree.
-                    result = np.dtype(np.float32)
-                else:
-                    result = (
-                            np.empty(0, dtype=result)
-                            + np.empty(0, dtype=other)
-                            ).dtype
-
-            elif result.fields is None and other.fields is not None:
-                # assume the non-native type takes over
-                # (This is used for vector types.)
-                result = other
-            elif result.fields is not None and other.fields is None:
-                # assume the non-native type takes over
-                # (This is used for vector types.)
-                pass
-            else:
-                if result is not other:
-                    raise TypeInferenceFailure(
-                            "nothing known about result of operation on "
-                            "'%s' and '%s'" % (result, other))
-
-        return NumpyType(result)
-
-    def map_sum(self, expr):
-        dtypes = []
-        small_integer_dtypes = []
-        for child in expr.children:
-            dtype = self.rec(child)
-            if is_integer(child) and abs(child) < 1024:
-                small_integer_dtypes.append(dtype)
-            else:
-                dtypes.append(dtype)
-
-        from pytools import all
-        if all(dtype.is_integral() for dtype in dtypes):
-            dtypes.extend(small_integer_dtypes)
-
-        return self.combine(dtypes)
-
-    map_product = map_sum
-
-    def map_quotient(self, expr):
-        n_dtype = self.rec(expr.numerator)
-        d_dtype = self.rec(expr.denominator)
-
-        if n_dtype.is_integral() and d_dtype.is_integral():
-            # both integers
-            return NumpyType(np.dtype(np.float64))
-
-        else:
-            return self.combine([n_dtype, d_dtype])
-
-    def map_constant(self, expr):
-        if is_integer(expr):
-            for tp in [np.int32, np.int64]:
-                iinfo = np.iinfo(tp)
-                if iinfo.min <= expr <= iinfo.max:
-                    return NumpyType(np.dtype(tp))
-
-            else:
-                raise TypeInferenceFailure("integer constant '%s' too large" % expr)
-
-        dt = np.asarray(expr).dtype
-        if hasattr(expr, "dtype"):
-            return NumpyType(expr.dtype)
-        elif isinstance(expr, np.number):
-            # Numpy types are sized
-            return NumpyType(np.dtype(type(expr)))
-        elif dt.kind == "f":
-            # deduce the smaller type by default
-            return NumpyType(np.dtype(np.float32))
-        elif dt.kind == "c":
-            if np.complex64(expr) == np.complex128(expr):
-                # (COMPLEX_GUESS_LOGIC)
-                # No precision is lost by 'guessing' single precision, use that.
-                # This at least covers simple cases like '1j'.
-                return NumpyType(np.dtype(np.complex64))
-
-            # Codegen for complex types depends on exactly correct types.
-            # Refuse temptation to guess.
-            raise TypeInferenceFailure("Complex constant '%s' needs to "
-                    "be sized for type inference " % expr)
-        else:
-            raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
-
-    def map_subscript(self, expr):
-        return self.rec(expr.aggregate)
-
-    def map_linear_subscript(self, expr):
-        return self.rec(expr.aggregate)
-
-    def map_call(self, expr, multiple_types_ok=False):
-        from pymbolic.primitives import Variable
-
-        identifier = expr.function
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        if identifier in ["indexof", "indexof_vec"]:
-            return self.kernel.index_dtype
-
-        arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
-
-        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if multiple_types_ok:
-            if mangle_result is not None:
-                return mangle_result.result_dtypes
-        else:
-            if mangle_result is not None:
-                if len(mangle_result.result_dtypes) != 1 and not multiple_types_ok:
-                    raise LoopyError("functions with more or fewer than one "
-                            "return value may only be used in direct assignments")
-
-                return mangle_result.result_dtypes[0]
-
-        raise RuntimeError("unable to resolve "
-                "function '%s' with %d given arguments"
-                % (identifier, len(arg_dtypes)))
-
-    def map_variable(self, expr):
-        if expr.name in self.kernel.all_inames():
-            return self.kernel.index_dtype
-
-        result = self.kernel.mangle_symbol(
-                self.kernel.target.get_device_ast_builder(),
-                expr.name)
-
-        if result is not None:
-            result_dtype, _ = result
-            return result_dtype
-
-        obj = self.new_assignments.get(expr.name)
-
-        if obj is None:
-            obj = self.kernel.arg_dict.get(expr.name)
-
-        if obj is None:
-            obj = self.kernel.temporary_variables.get(expr.name)
-
-        if obj is None:
-            raise TypeInferenceFailure("name not known in type inference: %s"
-                    % expr.name)
-
-        from loopy.kernel.data import TemporaryVariable, KernelArgument
-        import loopy as lp
-        if isinstance(obj, TemporaryVariable):
-            result = obj.dtype
-            if result is lp.auto:
-                raise DependencyTypeInferenceFailure(
-                        "temporary variable '%s'" % expr.name,
-                        expr.name)
-            else:
-                return result
-
-        elif isinstance(obj, KernelArgument):
-            result = obj.dtype
-            if result is None:
-                raise DependencyTypeInferenceFailure(
-                        "argument '%s'" % expr.name,
-                        expr.name)
-            else:
-                return result
-
-        else:
-            raise RuntimeError("unexpected type inference "
-                    "object type for '%s'" % expr.name)
-
-    map_tagged_variable = map_variable
-
-    def map_lookup(self, expr):
-        agg_result = self.rec(expr.aggregate)
-        field = agg_result.numpy_dtype.fields[expr.name]
-        dtype = field[0]
-        return NumpyType(dtype)
-
-    def map_comparison(self, expr):
-        # "bool" is unusable because OpenCL's bool has indeterminate memory
-        # format.
-        return NumpyType(np.dtype(np.int32))
-
-    map_logical_not = map_comparison
-    map_logical_and = map_comparison
-    map_logical_or = map_comparison
-
-    def map_group_hw_index(self, expr, *args):
-        return self.kernel.index_dtype
-
-    def map_local_hw_index(self, expr, *args):
-        return self.kernel.index_dtype
-
-    def map_reduction(self, expr, multiple_types_ok=False):
-        result = expr.operation.result_dtypes(
-                self.kernel, self.rec(expr.expr), expr.inames)
-
-        if multiple_types_ok:
-            return result
-
-        else:
-            if len(result) != 1 and not multiple_types_ok:
-                raise LoopyError("reductions with more or fewer than one "
-                        "return value may only be used in direct assignments")
-
-            return result[0]
-
-# }}}
-
-
 # {{{ vectorizability checker
 
 class VectorizabilityChecker(RecursiveMapper):
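
Code that previously imported the type inference machinery from
``loopy.expression`` now needs the new module path, as the import changes
elsewhere in this patch show::

    from loopy.type_inference import TypeInferenceMapper, infer_unknown_types
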
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index e657beecbc5453ae5b2390da5a958d2fc9a70771..602830de38e457c5ff4a55d7685dc346a7b4de35 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -102,7 +102,7 @@ def make_slab(space, iname, start, stop):
     return result
 
 
-def make_slab_from_bound_pwaffs(space, iname, lbound, ubound):
+def make_loop_bounds_from_pwaffs(space, iname, lbound, ubound):
     dt, pos = space.get_var_dict()[iname]
     iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos)
 
@@ -111,10 +111,10 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound):
     assert iname_pwaff.space == lbound.space
     assert iname_pwaff.space == ubound.space
 
-    return convexify(
-            iname_pwaff.ge_set(lbound)
-            &
-            iname_pwaff.le_set(ubound))
+    return (
+        iname_pwaff.ge_set(lbound)
+        &
+        iname_pwaff.le_set(ubound))
 
 # }}}
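
A small sketch of what the renamed helper computes; the names and bounds
here are made up::

    import islpy as isl

    space = isl.Set("[n] -> { [i] }").space
    i = isl.PwAff.var_on_domain(space, isl.dim_type.set, 0)
    lbound = isl.PwAff("[n] -> { [i] -> [(0)] }")
    ubound = isl.PwAff("[n] -> { [i] -> [(n - 1)] }")

    # ge_set/le_set turn the pointwise comparisons into a set; unlike
    # the old helper, the result is no longer run through convexify()
    print(i.ge_set(lbound) & i.le_set(ubound))
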
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d618d4b0dd9adc0ccb50827dce914571538a62a4..5b192934c8f56ccf364d1b9ba58d81fa8c28ff63 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -28,7 +28,7 @@ import six
 from six.moves import range, zip, intern
 
 import numpy as np
-from pytools import RecordWithoutPickling, Record, memoize_method
+from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method
 import islpy as isl
 from islpy import dim_type
 import re
@@ -83,7 +83,7 @@ class kernel_state:  # noqa
     SCHEDULED = 2
 
 
-class LoopKernel(RecordWithoutPickling):
+class LoopKernel(ImmutableRecordWithoutPickling):
     """These correspond more or less directly to arguments of
     :func:`loopy.make_kernel`.
 
@@ -259,7 +259,7 @@ class LoopKernel(RecordWithoutPickling):
         # }}}
 
         from loopy.types import to_loopy_type
-        index_dtype = to_loopy_type(index_dtype).with_target(target)
+        index_dtype = to_loopy_type(index_dtype, target=target)
         if not index_dtype.is_integral():
             raise TypeError("index_dtype must be an integer")
         if np.iinfo(index_dtype.numpy_dtype).min >= 0:
@@ -279,7 +279,7 @@ class LoopKernel(RecordWithoutPickling):
         assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains)
         assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT
 
-        RecordWithoutPickling.__init__(self,
+        ImmutableRecordWithoutPickling.__init__(self,
                 domains=domains,
                 instructions=instructions,
                 args=args,
@@ -889,7 +889,7 @@ class LoopKernel(RecordWithoutPickling):
                     dom_intersect_assumptions, iname_idx)
                 .coalesce())
 
-        class BoundsRecord(Record):
+        class BoundsRecord(ImmutableRecord):
             pass
 
         size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
@@ -1056,6 +1056,19 @@ class LoopKernel(RecordWithoutPickling):
 
     # }}}
 
+    # {{{ nosync sets
+
+    @memoize_method
+    def get_nosync_set(self, insn_id, scope):
+        assert scope in ("local", "global")
+
+        return frozenset(
+            other_id
+            for other_id, nosync_scope in self.id_to_insn[insn_id].no_sync_with
+            if nosync_scope == scope or nosync_scope == "any")
+
+    # }}}
+
     # {{{ pretty-printing
 
     def stringify(self, what=None, with_dependencies=False):
@@ -1213,7 +1226,9 @@ class LoopKernel(RecordWithoutPickling):
                     options.append(
                             "conflicts=%s" % ":".join(insn.conflicts_with_groups))
                 if insn.no_sync_with:
-                    options.append("no_sync_with=%s" % ":".join(insn.no_sync_with))
+                    # FIXME: Find a syntax to express scopes.
+                    options.append("no_sync_with=%s" % ":".join(id for id, _ in
+                                                                insn.no_sync_with))
 
                 if lhs:
                     core = "%s <- %s" % (
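
Usage of the new ``get_nosync_set`` helper might look like this, with a
hypothetical kernel ``knl`` and instruction ID::

    # all instruction IDs that 'store' need not synchronize with via
    # local (or "any"-scoped) barriers
    nosync_ids = knl.get_nosync_set("store", scope="local")
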
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 99bbc7bf9e782fe7995c20d8d3482602c9874dc9..a02fc58d97f370d45f36a465c38fa3caf3da9d41 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -30,7 +30,7 @@ import six
 from six.moves import range, zip
 from six import iteritems
 
-from pytools import Record, memoize_method
+from pytools import ImmutableRecord, memoize_method
 
 import numpy as np  # noqa
 
@@ -40,7 +40,7 @@ from loopy.tools import is_integer
 
 # {{{ array dimension tags
 
-class ArrayDimImplementationTag(Record):
+class ArrayDimImplementationTag(ImmutableRecord):
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
         :class:`pytools.persistent_dict.PersistentDict`.
@@ -544,7 +544,7 @@ def _parse_shape_or_strides(x):
     return tuple(_pymbolic_parse_if_necessary(xi) for xi in x)
 
 
-class ArrayBase(Record):
+class ArrayBase(ImmutableRecord):
     """
     .. attribute :: name
 
@@ -576,6 +576,7 @@ class ArrayBase(Record):
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
             dim_names=None, strides=None, order=None, for_atomic=False,
+            target=None,
             **kwargs):
         """
         All of the following are optional. Specify either strides or shape.
@@ -659,7 +660,7 @@ class ArrayBase(Record):
 
         from loopy.types import to_loopy_type
         dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
-                for_atomic=for_atomic)
+                for_atomic=for_atomic, target=target)
 
         strides_known = strides is not None and strides is not lp.auto
         shape_known = shape is not None and shape is not lp.auto
@@ -786,7 +787,7 @@ class ArrayBase(Record):
             warn("dim_names is not a tuple when calling ArrayBase constructor",
                     DeprecationWarning, stacklevel=2)
 
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 name=name,
                 dtype=dtype,
                 shape=shape,
@@ -1162,7 +1163,7 @@ class ArrayBase(Record):
 
 # {{{ access code generation
 
-class AccessInfo(Record):
+class AccessInfo(ImmutableRecord):
     """
     .. attribute:: array_name
     .. attribute:: vector_index
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index ff3bf16bcf32b26b1865d350aefbef80ec4e4554..6c5491384d4fc37dc48604aa52753d11ac10fc55 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -149,9 +149,9 @@ def expand_defines_in_expr(expr, defines):
 
 def get_default_insn_options_dict():
     return {
-        "depends_on": None,
+        "depends_on": frozenset(),
         "depends_on_is_final": False,
-        "no_sync_with": None,
+        "no_sync_with": frozenset(),
         "groups": frozenset(),
         "conflicts_with_groups": frozenset(),
         "insn_id": None,
@@ -221,18 +221,37 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
                 result["depends_on_is_final"] = True
                 opt_value = (opt_value[1:]).strip()
 
-            result["depends_on"] = frozenset(
+            result["depends_on"] = result["depends_on"].union(frozenset(
                     intern(dep.strip()) for dep in opt_value.split(":")
-                    if dep.strip())
+                    if dep.strip()))
+
+        elif opt_key == "dep_query" and opt_value is not None:
+            from loopy.match import parse_match
+            match = parse_match(opt_value)
+            result["depends_on"] = result["depends_on"].union(frozenset([match]))
 
         elif opt_key == "nosync" and opt_value is not None:
             if is_with_block:
                 raise LoopyError("'nosync' option may not be specified "
                         "in a 'with' block")
 
-            result["no_sync_with"] = frozenset(
-                    intern(dep.strip()) for dep in opt_value.split(":")
-                    if dep.strip())
+            # TODO: Come up with a syntax that allows the user to express
+            # different synchronization scopes.
+            result["no_sync_with"] = result["no_sync_with"].union(frozenset(
+                    (intern(dep.strip()), "any")
+                    for dep in opt_value.split(":") if dep.strip()))
+
+        elif opt_key == "nosync_query" and opt_value is not None:
+            if is_with_block:
+                raise LoopyError("'nosync' option may not be specified "
+                        "in a 'with' block")
+
+            from loopy.match import parse_match
+            match = parse_match(opt_value)
+            # TODO: Come up with a syntax that allows the user to express
+            # different synchronization scopes.
+            result["no_sync_with"] = result["no_sync_with"].union(
+                    frozenset([(match, "any")]))
 
         elif opt_key == "groups" and opt_value is not None:
             result["groups"] = frozenset(
@@ -555,10 +574,16 @@ def parse_instructions(instructions, defines):
             continue
 
         elif isinstance(insn, InstructionBase):
+            def intern_if_str(s):
+                if isinstance(s, str):
+                    return intern(s)
+                else:
+                    return s
+
             new_instructions.append(
                     insn.copy(
                         id=intern(insn.id) if isinstance(insn.id, str) else insn.id,
-                        depends_on=frozenset(intern(dep) for dep in insn.depends_on),
+                        depends_on=frozenset(
+                            intern_if_str(dep) for dep in insn.depends_on),
                         groups=frozenset(intern(grp) for grp in insn.groups),
                         conflicts_with_groups=frozenset(
                             intern(grp) for grp in insn.conflicts_with_groups),
@@ -1244,7 +1269,8 @@ def create_temporaries(knl, default_order):
                         scope=lp.auto,
                         base_indices=lp.auto,
                         shape=lp.auto,
-                        order=default_order)
+                        order=default_order,
+                        target=knl.target)
 
                 if isinstance(insn, Assignment):
                     insn = insn.copy(temp_var_type=None)
@@ -1412,43 +1438,40 @@ def apply_default_order_to_args(kernel, default_order):
 # }}}
 
 
-# {{{ resolve wildcard insn dependencies
-
-def find_matching_insn_ids(knl, dep):
-    from fnmatch import fnmatchcase
+# {{{ resolve instruction dependencies
 
-    return [
-        other_insn.id
-        for other_insn in knl.instructions
-        if fnmatchcase(other_insn.id, dep)]
+def _resolve_dependencies(knl, insn, deps):
+    from loopy import find_instructions
+    from loopy.match import MatchExpressionBase
 
-
-def resove_wildcard_insn_ids(knl, deps):
     new_deps = []
-    for dep in deps:
-        matches = find_matching_insn_ids(knl, dep)
 
-        if matches:
-            new_deps.extend(matches)
+    for dep in deps:
+        if isinstance(dep, MatchExpressionBase):
+            for new_dep in find_instructions(knl, dep):
+                if new_dep.id != insn.id:
+                    new_deps.append(new_dep.id)
         else:
-            # Uh, best we can do
-            new_deps.append(dep)
+            from fnmatch import fnmatchcase
+            for other_insn in knl.instructions:
+                if fnmatchcase(other_insn.id, dep):
+                    new_deps.append(other_insn.id)
 
     return frozenset(new_deps)
 
 
-def resolve_wildcard_deps(knl):
+def resolve_dependencies(knl):
     new_insns = []
 
     for insn in knl.instructions:
-        if insn.depends_on is not None:
-            insn = insn.copy(
-                    depends_on=resove_wildcard_insn_ids(knl, insn.depends_on),
-                    no_sync_with=resove_wildcard_insn_ids(
-                        knl, insn.no_sync_with),
-                    )
-
-        new_insns.append(insn)
+        new_insns.append(insn.copy(
+                    depends_on=_resolve_dependencies(knl, insn, insn.depends_on),
+                    no_sync_with=frozenset(
+                        (resolved_insn_id, nosync_scope)
+                        for nosync_dep, nosync_scope in insn.no_sync_with
+                        for resolved_insn_id in
+                        _resolve_dependencies(knl, insn, nosync_dep)),
+                    ))
 
     return knl.copy(instructions=new_insns)
 
@@ -1785,7 +1808,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     knl = expand_defines_in_shapes(knl, defines)
     knl = guess_arg_shape_if_requested(knl, default_order)
     knl = apply_default_order_to_args(knl, default_order)
-    knl = resolve_wildcard_deps(knl)
+    knl = resolve_dependencies(knl)
     knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=False)
 
     # -------------------------------------------------------------------------
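
With the parsing changes above, a match-based dependency can be spelled
directly in the instruction options; a hypothetical example::

    import loopy as lp

    knl = lp.make_kernel(
            "{ [i]: 0 <= i < n }",
            """
            a[i] = i                {id=write_a}
            out[i] = 2*a[i]         {id=write_out, dep_query=writes:a}
            """)

    # 'dep_query=writes:a' is parsed by loopy.match.parse_match and, at
    # kernel creation, resolved to the IDs of all instructions writing
    # 'a' -- here, write_a.
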
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 004fae7f9664ff62c34a994671ea792e4eddc836..61be55ca88b105f2cf58e8aeace09e9c20f54857 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 
 from six.moves import intern
 import numpy as np  # noqa
-from pytools import Record
+from pytools import ImmutableRecord
 from loopy.kernel.array import ArrayBase
 from loopy.diagnostic import LoopyError
 from loopy.kernel.instruction import (  # noqa
@@ -54,7 +54,7 @@ class auto(object):  # noqa
 
 # {{{ iname tags
 
-class IndexTag(Record):
+class IndexTag(ImmutableRecord):
     __slots__ = []
 
     def __hash__(self):
@@ -93,7 +93,7 @@ class AxisTag(UniqueTag):
     __slots__ = ["axis"]
 
     def __init__(self, axis):
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 axis=axis)
 
     @property
@@ -197,21 +197,24 @@ def parse_tag(tag):
 
 # {{{ arguments
 
-class KernelArgument(Record):
+class KernelArgument(ImmutableRecord):
     """Base class for all argument types"""
 
     def __init__(self, **kwargs):
         kwargs["name"] = intern(kwargs.pop("name"))
 
+        target = kwargs.pop("target", None)
+
         dtype = kwargs.pop("dtype", None)
         from loopy.types import to_loopy_type
         kwargs["dtype"] = to_loopy_type(
-                dtype, allow_auto=True, allow_none=True)
+                dtype, allow_auto=True, allow_none=True, target=target)
 
-        Record.__init__(self, **kwargs)
+        ImmutableRecord.__init__(self, **kwargs)
 
 
 class GlobalArg(ArrayBase, KernelArgument):
+    __doc__ = ArrayBase.__doc__
     min_target_axes = 0
     max_target_axes = 1
 
@@ -221,6 +224,7 @@ class GlobalArg(ArrayBase, KernelArgument):
 
 
 class ConstantArg(ArrayBase, KernelArgument):
+    __doc__ = ArrayBase.__doc__
     min_target_axes = 0
     max_target_axes = 1
 
@@ -230,6 +234,7 @@ class ConstantArg(ArrayBase, KernelArgument):
 
 
 class ImageArg(ArrayBase, KernelArgument):
+    __doc__ = ArrayBase.__doc__
     min_target_axes = 1
     max_target_axes = 3
 
@@ -243,11 +248,11 @@ class ImageArg(ArrayBase, KernelArgument):
 
 
 class ValueArg(KernelArgument):
-    def __init__(self, name, dtype=None, approximately=1000):
-        from loopy.types import to_loopy_type
+    def __init__(self, name, dtype=None, approximately=1000, target=None):
         KernelArgument.__init__(self, name=name,
-                dtype=to_loopy_type(dtype, allow_auto=True, allow_none=True),
-                approximately=approximately)
+                dtype=dtype,
+                approximately=approximately,
+                target=target)
 
     def __str__(self):
         import loopy as lp
@@ -509,7 +514,7 @@ class TemporaryVariable(ArrayBase):
 
 # {{{ substitution rule
 
-class SubstitutionRule(Record):
+class SubstitutionRule(ImmutableRecord):
     """
     .. attribute:: name
     .. attribute:: arguments
@@ -522,7 +527,7 @@ class SubstitutionRule(Record):
     def __init__(self, name, arguments, expression):
         assert isinstance(arguments, tuple)
 
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 name=name, arguments=arguments, expression=expression)
 
     def __str__(self):
@@ -543,7 +548,7 @@ class SubstitutionRule(Record):
 
 # {{{ function call mangling
 
-class CallMangleInfo(Record):
+class CallMangleInfo(ImmutableRecord):
     """
     .. attribute:: target_name
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index c54d1fc329a3a8797b17458dc40e489044e9374a..93642103e50da5aabfcb7bd86cc50ce6ff903a18 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -23,14 +23,14 @@ THE SOFTWARE.
 """
 
 from six.moves import intern
-from pytools import Record, memoize_method
+from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import LoopyError
 from warnings import warn
 
 
 # {{{ instructions: base class
 
-class InstructionBase(Record):
+class InstructionBase(ImmutableRecord):
     """A base class for all types of instruction that can occur in
     a kernel.
 
@@ -51,6 +51,17 @@ class InstructionBase(Record):
 
         May be *None* to invoke the default.
 
+        There are two extensions to this:
+
+        - You may use ``*`` as a wildcard in the given IDs. This will be
+          expanded to all matching IDs during :func:`loopy.make_kernel`.
+        - Instead of an instruction ID, you may pass an instance of
+          :class:`loopy.match.MatchExpressionBase` into the :attr:`depends_on`
+          :class:`frozenset`. The given expression will be used to add any
+          matching instructions in the kernel to :attr:`depends_on` during
+          :func:`loopy.make_kernel`. Note that this is not meant as a
+          user-facing interface.
+
     .. attribute:: depends_on_is_final
 
         A :class:`bool` determining whether :attr:`depends_on` constitutes
@@ -80,9 +91,20 @@ class InstructionBase(Record):
 
     .. attribute:: no_sync_with
 
-        a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
-        with which no barrier synchronization is necessary, even given the existence
-        of a dependency chain and apparently conflicting access
+        a :class:`frozenset` of tuples of the form ``(insn_id, scope)``, where
+        ``insn_id`` refers to the :attr:`id` of an :class:`Instruction`
+        instance and ``scope`` is one of the following strings:
+
+           - ``"local"``
+           - ``"global"``
+           - ``"any"``.
+
+        This indicates that no barrier synchronization is necessary with the
+        given instruction using barriers of type ``scope``, even given the
+        existence of a dependency chain and apparently conflicting access.
+
+        Note that :attr:`no_sync_with` allows instruction matching via
+        wildcards and match expressions, just like :attr:`depends_on`.
 
     .. rubric:: Conditionals
 
@@ -177,7 +199,7 @@ class InstructionBase(Record):
 
             new_predicates.add(pred)
 
-        predicates = new_predicates
+        predicates = frozenset(new_predicates)
         del new_predicates
 
         # }}}
@@ -233,7 +255,7 @@ class InstructionBase(Record):
         assert isinstance(groups, frozenset)
         assert isinstance(conflicts_with_groups, frozenset)
 
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 id=id,
                 depends_on=depends_on,
                 depends_on_is_final=depends_on_is_final,
@@ -366,7 +388,10 @@ class InstructionBase(Record):
         if self.depends_on:
             result.append("dep="+":".join(self.depends_on))
         if self.no_sync_with:
-            result.append("nosync="+":".join(self.no_sync_with))
+            # TODO: Come up with a syntax to express different kinds of
+            # synchronization scopes.
+            result.append("nosync="+":".join(
+                    insn_id for insn_id, _ in self.no_sync_with))
         if self.groups:
             result.append("groups=%s" % ":".join(self.groups))
         if self.conflicts_with_groups:
@@ -382,19 +407,6 @@ class InstructionBase(Record):
 
     # {{{ comparison, hashing
 
-    def __eq__(self, other):
-        if not type(self) == type(other):
-            return False
-
-        for field_name in self.fields:
-            if getattr(self, field_name) != getattr(other, field_name):
-                return False
-
-        return True
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
         :class:`pytools.persistent_dict.PersistentDict`.
@@ -1159,7 +1171,7 @@ class CInstruction(InstructionBase):
                     for name, expr in self.iname_exprs],
                 assignees=[f(a, *args) for a in self.assignees],
                 predicates=frozenset(
-                    f(pred) for pred in self.predicates))
+                    f(pred, *args) for pred in self.predicates))
 
     # }}}
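
Constructing an instruction with the new tuple-valued ``no_sync_with`` might
look as follows; the instruction IDs are illustrative::

    import loopy as lp

    insn = lp.Assignment(
            "acc[i]", "0",
            id="init",
            no_sync_with=frozenset([("update", "any")]))
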
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 7e9bd549fede6abf6d4d5db99896063b34246793..cbacf5e284fe42ae2b5605d12fa3582bcf0ac4fd 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -70,7 +70,7 @@ def _add_dtypes(knl, dtype_dict):
     for arg in knl.args:
         new_dtype = dtype_dict.pop(arg.name, None)
         if new_dtype is not None:
-            new_dtype = to_loopy_type(new_dtype)
+            new_dtype = to_loopy_type(new_dtype, target=knl.target)
             if arg.dtype is not None and arg.dtype != new_dtype:
                 raise RuntimeError(
                         "argument '%s' already has a different dtype "
@@ -116,14 +116,14 @@ def add_and_infer_dtypes(knl, dtype_dict):
 
     knl = add_dtypes(knl, processed_dtype_dict)
 
-    from loopy.preprocess import infer_unknown_types
+    from loopy.type_inference import infer_unknown_types
     return infer_unknown_types(knl, expect_completion=True)
 
 
 def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
     knl = _add_dtypes_overdetermined(knl, dtype_dict)
 
-    from loopy.preprocess import infer_unknown_types
+    from loopy.type_inference import infer_unknown_types
     return infer_unknown_types(knl, expect_completion=True)
 
 # }}}
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 7d04b8c7330f88af9ee1d79fe19fd87b29b70050..b8633114ddeb9d48eb33a765755302917ca27f63 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -25,14 +25,14 @@ THE SOFTWARE.
 """
 
 
-from pytools import Record
+from pytools import ImmutableRecord
 from mako.template import Template
 import numpy as np
 
 
 # {{{ rng metadata
 
-class RNGInfo(Record):
+class RNGInfo(ImmutableRecord):
     @property
     def full_name(self):
         return "%s%dx%d" % (self.name, self.width, self.bits)
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 8a38eebd55b003c624b386bcdf296d2b97e2c97c..f435820b23e8da909f0cff14ff5a1272874e865f 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -37,6 +37,11 @@ class ReductionOperation(object):
     """
 
     def result_dtypes(self, target, arg_dtype, inames):
+        """
+        :arg arg_dtype: may be *None* if not known
+        :returns: *None* if not known, otherwise a tuple of result dtypes
+        """
+
         raise NotImplementedError
 
     def neutral_element(self, dtype, inames):
@@ -87,6 +92,9 @@ class ScalarReductionOperation(ReductionOperation):
             return (self.parse_result_type(
                     kernel.target, self.forced_result_type),)
 
+        if arg_dtype is None:
+            return None
+
         return (arg_dtype,)
 
     def __hash__(self):
diff --git a/loopy/match.py b/loopy/match.py
index 053fc9d4883d97b9184d85429aac3b6507d28e0e..ab0038af8dc5e9189a382bb76115998f57aef74e 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -58,6 +58,7 @@ def re_from_glob(s):
     from fnmatch import translate
     return re.compile("^"+translate(s.strip())+"$")
 
+
 # {{{ parsing
 
 # {{{ lexer data
@@ -72,7 +73,7 @@ _id = intern("_id")
 _tag = intern("_tag")
 _writes = intern("_writes")
 _reads = intern("_reads")
-_iname = intern("_reads")
+_iname = intern("_iname")
 
 _whitespace = intern("_whitespace")
 
@@ -107,6 +108,8 @@ _PREC_NOT = 30
 
 # }}}
 
+# }}}
+
 
 # {{{ match expression
 
@@ -137,6 +140,9 @@ class All(MatchExpressionBase):
     def __eq__(self, other):
         return (type(self) == type(other))
 
+    def __hash__(self):
+        return hash(type(self))
+
 
 class And(MatchExpressionBase):
     def __init__(self, children):
@@ -156,6 +162,9 @@ class And(MatchExpressionBase):
         return (type(self) == type(other)
                 and self.children == other.children)
 
+    def __hash__(self):
+        return hash((type(self), self.children))
+
 
 class Or(MatchExpressionBase):
     def __init__(self, children):
@@ -175,6 +184,9 @@ class Or(MatchExpressionBase):
         return (type(self) == type(other)
                 and self.children == other.children)
 
+    def __hash__(self):
+        return hash((type(self), self.children))
+
 
 class Not(MatchExpressionBase):
     def __init__(self, child):
@@ -194,6 +206,9 @@ class Not(MatchExpressionBase):
         return (type(self) == type(other)
                 and self.child == other.child)
 
+    def __hash__(self):
+        return hash((type(self), self.child))
+
 
 class GlobMatchExpressionBase(MatchExpressionBase):
     def __init__(self, glob):
@@ -215,6 +230,9 @@ class GlobMatchExpressionBase(MatchExpressionBase):
         return (type(self) == type(other)
                 and self.glob == other.glob)
 
+    def __hash__(self):
+        return hash((type(self), self.glob))
+
 
 class Id(GlobMatchExpressionBase):
     def __call__(self, kernel, matchable):
@@ -244,7 +262,7 @@ class Reads(GlobMatchExpressionBase):
 class Iname(GlobMatchExpressionBase):
     def __call__(self, kernel, matchable):
         return any(self.re.match(name)
-                for name in matchable.inames(kernel))
+                for name in matchable.within_inames)
 
 # }}}
 
@@ -350,8 +368,6 @@ def parse_match(expr):
 
 # }}}
 
-# }}}
-
 
 # {{{ stack match objects
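
Since match expressions are now hashable, they can participate in the
frozensets used for dependencies; for instance::

    from loopy.match import parse_match

    m = parse_match("writes:tmp* and not id:init*")
    deps = frozenset([m, "some_insn_id"])  # mixing IDs and matches works
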
 
diff --git a/loopy/options.py b/loopy/options.py
index 5db1be64624c027a6579f28c99db1bb4e78e3bc3..33b216e1ee7fe95b0930af2643ac3ccc5693a4ee 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -23,16 +23,49 @@ THE SOFTWARE.
 """
 
 
-from pytools import Record
+import six
+from pytools import ImmutableRecord
 import re
 
 
+ALLOW_TERMINAL_COLORS = True
+
+
 class _ColoramaStub(object):
     def __getattribute__(self, name):
         return ""
 
 
-class Options(Record):
+def _apply_legacy_map(lmap, kwargs):
+    result = {}
+
+    for name, val in six.iteritems(kwargs):
+        try:
+            lmap_value = lmap[name]
+        except KeyError:
+            new_name = name
+        else:
+            if lmap_value is None:
+                # ignore this
+                from warnings import warn
+                warn("option '%s' is deprecated and was ignored" % name,
+                        DeprecationWarning)
+                continue
+
+            new_name, translator = lmap_value
+            if name in result:
+                raise TypeError("may not pass a value for both '%s' and '%s'"
+                        % (name, new_name))
+
+            if translator is not None:
+                val = translator(val)
+
+        result[new_name] = val
+
+    return result
+
+
+class Options(ImmutableRecord):
     """
     Unless otherwise specified, these options are Boolean-valued
     (i.e. on/off).
@@ -91,30 +124,21 @@ class Options(Record):
         Accepts a file name as a value. Writes to
         ``sys.stdout`` if none is given.
 
-    .. attribute:: highlight_wrapper
-
-        Use syntax highlighting in :attr:`write_wrapper`.
-
-    .. attribute:: write_cl
-
-        Print the generated OpenCL kernel.
-        Accepts a file name as a value. Writes to
-        ``sys.stdout`` if none is given.
-
-    .. attribute:: highlight_cl
+    .. attribute:: write_code
 
-        Use syntax highlighting in :attr:`write_cl`.
+        Print the generated code.  Accepts a file name or a boolean as a value.
+        Writes to ``sys.stdout`` if set to *True*.
 
-    .. attribute:: edit_cl
+    .. attribute:: edit_code
 
         Invoke an editor (given by the environment variable
         :envvar:`EDITOR`) on the generated kernel code,
         allowing for tweaks before the code is passed on to
-        the OpenCL implementation for compilation.
+        the target for compilation.
 
-    .. attribute:: cl_build_options
+    .. attribute:: build_options
 
-        Options to pass to the OpenCL compiler when building the kernel.
+        Options to pass to the target compiler when building the kernel.
         A list of strings.
 
     .. attribute:: allow_terminal_colors
@@ -126,6 +150,16 @@ class Options(Record):
     .. attribute:: disable_global_barriers
     """
 
+    _legacy_options_map = {
+            "cl_build_options": ("build_options", None),
+            "write_cl": ("write_code", None),
+            "highlight_cl": None,
+            "highlight_wrapper": None,
+            "disable_wrapper_highlight": None,
+            "disable_code_highlight": None,
+            "edit_cl": ("edit_code", None),
+            }
+
     def __init__(
             # All Boolean flags in here should default to False for the
             # string-based interface of make_options (below) to make sense.
@@ -133,46 +167,65 @@ class Options(Record):
             # All defaults are further required to be False when cast to bool
             # for the update() functionality to work.
 
-            self,
-
-            annotate_inames=False,
-            trace_assignments=False,
-            trace_assignment_values=False,
-            ignore_boostable_into=False,
-
-            skip_arg_checks=False, no_numpy=False, return_dict=False,
-            write_wrapper=False, highlight_wrapper=False,
-            write_cl=False, highlight_cl=False,
-            edit_cl=False, cl_build_options=[],
-            allow_terminal_colors=None,
-            disable_global_barriers=False,
-            ):
-
-        if allow_terminal_colors is None:
-            try:
-                import colorama  # noqa
-            except ImportError:
-                allow_terminal_colors = False
-            else:
-                allow_terminal_colors = True
+            self, **kwargs):
+
+        kwargs = _apply_legacy_map(self._legacy_options_map, kwargs)
+
+        try:
+            import colorama  # noqa
+        except ImportError:
+            allow_terminal_colors_def = False
+        else:
+            allow_terminal_colors_def = True
+
+        allow_terminal_colors_def = (
+                ALLOW_TERMINAL_COLORS and allow_terminal_colors_def)
 
-        Record.__init__(
+        ImmutableRecord.__init__(
                 self,
 
-                annotate_inames=annotate_inames,
-                trace_assignments=trace_assignments,
-                trace_assignment_values=trace_assignment_values,
-                ignore_boostable_into=ignore_boostable_into,
-
-                skip_arg_checks=skip_arg_checks, no_numpy=no_numpy,
-                return_dict=return_dict,
-                write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper,
-                write_cl=write_cl, highlight_cl=highlight_cl,
-                edit_cl=edit_cl, cl_build_options=cl_build_options,
-                allow_terminal_colors=allow_terminal_colors,
-                disable_global_barriers=disable_global_barriers,
+                annotate_inames=kwargs.get("annotate_inames", False),
+                trace_assignments=kwargs.get("trace_assignments", False),
+                trace_assignment_values=kwargs.get("trace_assignment_values", False),
+                ignore_boostable_into=kwargs.get("ignore_boostable_into", False),
+
+                skip_arg_checks=kwargs.get("skip_arg_checks", False),
+                no_numpy=kwargs.get("no_numpy", False),
+                return_dict=kwargs.get("return_dict", False),
+                write_wrapper=kwargs.get("write_wrapper", False),
+                write_code=kwargs.get("write_code", False),
+                edit_code=kwargs.get("edit_code", False),
+                build_options=kwargs.get("build_options", []),
+                allow_terminal_colors=kwargs.get("allow_terminal_colors",
+                    allow_terminal_colors_def),
+                disable_global_barriers=kwargs.get("disable_global_barriers",
+                    False),
                 )
 
+    # {{{ legacy compatibility
+
+    @property
+    def edit_cl(self):
+        return self.edit_code
+
+    @property
+    def cl_build_options(self):
+        return self.build_options
+
+    @property
+    def highlight_cl(self):
+        return self.allow_terminal_colors
+
+    @property
+    def highlight_wrapper(self):
+        return self.allow_terminal_colors
+
+    @property
+    def write_cl(self):
+        return self.write_code
+
+    # }}}
+
     def update(self, other):
         for f in self.__class__.fields:
             setattr(self, f, getattr(self, f) or getattr(other, f))
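
The legacy-option translation above means old spellings keep working; for
example::

    import loopy as lp

    opts = lp.Options(write_cl=True)    # legacy name
    assert opts.write_code              # mapped to the new field

    # dropped options such as highlight_cl are ignored with a
    # DeprecationWarning rather than raising
    lp.Options(highlight_cl=True)
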
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index c0f42e55aaf7710a8a91781cb2f0d0af905871dd..6b5488a20bc9d714fb5fde908b559ddebf4b9591 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -35,6 +35,8 @@ from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment
+# for the benefit of loopy.statistics, for now
+from loopy.type_inference import infer_unknown_types
 
 import logging
 logger = logging.getLogger(__name__)
@@ -70,6 +72,24 @@ def prepare_for_caching(kernel):
 # }}}
 
 
+# {{{ check for writes to predicates
+
+def check_for_writes_to_predicates(kernel):
+    from loopy.symbolic import get_dependencies
+    for insn in kernel.instructions:
+        pred_vars = (
+                frozenset.union(
+                    *(get_dependencies(pred) for pred in insn.predicates))
+                if insn.predicates else frozenset())
+        written_pred_vars = frozenset(insn.assignee_var_names()) & pred_vars
+        if written_pred_vars:
+            raise LoopyError("In instruction '%s': may not write to "
+                    "variable(s) '%s' involved in the instruction's predicates"
+                    % (insn.id, ", ".join(written_pred_vars)))
+
+# }}}
+
+
 # {{{ check reduction iname uniqueness
 
 def check_reduction_iname_uniqueness(kernel):
@@ -109,193 +129,6 @@ def check_reduction_iname_uniqueness(kernel):
 # }}}
 
 
-# {{{ infer types
-
-def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
-    if var_name in kernel.all_params():
-        return kernel.index_dtype, []
-
-    def debug(s):
-        logger.debug("%s: %s" % (kernel.name, s))
-
-    dtypes = []
-
-    import loopy as lp
-
-    symbols_with_unavailable_types = []
-
-    from loopy.diagnostic import DependencyTypeInferenceFailure
-    for writer_insn_id in kernel.writer_map().get(var_name, []):
-        writer_insn = kernel.id_to_insn[writer_insn_id]
-        if not isinstance(writer_insn, lp.MultiAssignmentBase):
-            continue
-
-        expr = subst_expander(writer_insn.expression)
-
-        try:
-            debug("             via expr %s" % expr)
-            if isinstance(writer_insn, lp.Assignment):
-                result = type_inf_mapper(expr)
-            elif isinstance(writer_insn, lp.CallInstruction):
-                result_dtypes = type_inf_mapper(expr, multiple_types_ok=True)
-
-                result = None
-                for assignee, comp_dtype in zip(
-                        writer_insn.assignee_var_names(), result_dtypes):
-                    if assignee == var_name:
-                        result = comp_dtype
-                        break
-
-                assert result is not None
-
-            debug("             result: %s" % result)
-
-            dtypes.append(result)
-
-        except DependencyTypeInferenceFailure as e:
-            debug("             failed: %s" % e)
-            symbols_with_unavailable_types.append(e.symbol)
-
-    if not dtypes:
-        return None, symbols_with_unavailable_types
-
-    result = type_inf_mapper.combine(dtypes)
-
-    return result, []
-
-
-class _DictUnionView:
-    def __init__(self, children):
-        self.children = children
-
-    def get(self, key):
-        try:
-            return self[key]
-        except KeyError:
-            return None
-
-    def __getitem__(self, key):
-        for ch in self.children:
-            try:
-                return ch[key]
-            except KeyError:
-                pass
-
-        raise KeyError(key)
-
-
-def infer_unknown_types(kernel, expect_completion=False):
-    """Infer types on temporaries and arguments."""
-
-    logger.debug("%s: infer types" % kernel.name)
-
-    def debug(s):
-        logger.debug("%s: %s" % (kernel.name, s))
-
-    unexpanded_kernel = kernel
-    if kernel.substitutions:
-        from loopy.transform.subst import expand_subst
-        kernel = expand_subst(kernel)
-
-    new_temp_vars = kernel.temporary_variables.copy()
-    new_arg_dict = kernel.arg_dict.copy()
-
-    # {{{ fill queue
-
-    # queue contains temporary variables
-    queue = []
-
-    import loopy as lp
-    for tv in six.itervalues(kernel.temporary_variables):
-        if tv.dtype is lp.auto:
-            queue.append(tv)
-
-    for arg in kernel.args:
-        if arg.dtype is None:
-            queue.append(arg)
-
-    # }}}
-
-    from loopy.expression import TypeInferenceMapper
-    type_inf_mapper = TypeInferenceMapper(kernel,
-            _DictUnionView([
-                new_temp_vars,
-                new_arg_dict
-                ]))
-
-    from loopy.symbolic import SubstitutionRuleExpander
-    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
-
-    # {{{ work on type inference queue
-
-    from loopy.kernel.data import TemporaryVariable, KernelArgument
-
-    failed_names = set()
-    while queue:
-        item = queue.pop(0)
-
-        debug("inferring type for %s %s" % (type(item).__name__, item.name))
-
-        result, symbols_with_unavailable_types = \
-                _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
-
-        failed = result is None
-        if not failed:
-            debug("     success: %s" % result)
-            if isinstance(item, TemporaryVariable):
-                new_temp_vars[item.name] = item.copy(dtype=result)
-            elif isinstance(item, KernelArgument):
-                new_arg_dict[item.name] = item.copy(dtype=result)
-            else:
-                raise LoopyError("unexpected item type in type inference")
-        else:
-            debug("     failure")
-
-        if failed:
-            if item.name in failed_names:
-                # this item has failed before, give up.
-                advice = ""
-                if symbols_with_unavailable_types:
-                    advice += (
-                            " (need type of '%s'--check for missing arguments)"
-                            % ", ".join(symbols_with_unavailable_types))
-
-                if expect_completion:
-                    raise LoopyError(
-                            "could not determine type of '%s'%s"
-                            % (item.name, advice))
-
-                else:
-                    # We're done here.
-                    break
-
-            # remember that this item failed
-            failed_names.add(item.name)
-
-            queue_names = set(qi.name for qi in queue)
-
-            if queue_names == failed_names:
-                # We did what we could...
-                print(queue_names, failed_names, item.name)
-                assert not expect_completion
-                break
-
-            # can't infer type yet, put back into queue
-            queue.append(item)
-        else:
-            # we've made progress, reset failure markers
-            failed_names = set()
-
-    # }}}
-
-    return unexpanded_kernel.copy(
-            temporary_variables=new_temp_vars,
-            args=[new_arg_dict[arg.name] for arg in kernel.args],
-            )
-
-# }}}
-
-
 # {{{ decide temporary scope
 
 def _get_compute_inames_tagged(kernel, insn, tag_base):
@@ -462,7 +295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
 
-    from loopy.expression import TypeInferenceMapper
+    from loopy.type_inference import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
     # {{{ sequential
@@ -626,7 +459,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset([init_id]) | insn.depends_on,
-                no_sync_with=frozenset([init_id]))
+                no_sync_with=frozenset([(init_id, "any")]))
         generated_insns.append(transfer_insn)
 
         def _strip_if_scalar(c):
@@ -684,7 +517,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             istage += 1
 
         new_insn_add_depends_on.add(prev_id)
-        new_insn_add_no_sync_with.add(prev_id)
+        new_insn_add_no_sync_with.add((prev_id, "any"))
         new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)
 
         if nresults == 1:
@@ -1061,6 +894,7 @@ def preprocess_kernel(kernel, device=None):
 
     kernel = infer_unknown_types(kernel, expect_completion=False)
 
+    check_for_writes_to_predicates(kernel)
     check_reduction_iname_uniqueness(kernel)
 
     from loopy.kernel.creation import apply_single_writer_depencency_heuristic
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index ffef62cf8b4f7cc65a3c11e06c7d42c23d18eafd..c8174d94cf9f86bde574b3e1eff353d26438cab8 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -24,7 +24,7 @@ THE SOFTWARE.
 
 
 import six
-from pytools import Record
+from pytools import ImmutableRecord
 import sys
 import islpy as isl
 from loopy.diagnostic import warn_with_kernel, LoopyError  # noqa
@@ -39,7 +39,7 @@ logger = logging.getLogger(__name__)
 
 # {{{ schedule items
 
-class ScheduleItem(Record):
+class ScheduleItem(ImmutableRecord):
     __slots__ = []
 
     def update_persistent_hash(self, key_hash, key_builder):
@@ -399,6 +399,17 @@ def get_priority_tiers(wanted, priorities):
     for tier in get_priority_tiers(wanted, priorities):
         yield tier
 
+
+def sched_item_to_insn_id(sched_item):
+    # Helper for use in generator expressions, e.g.
+    # (... for insn_id in sched_item_to_insn_id(item) ...)
+    if isinstance(sched_item, RunInstruction):
+        yield sched_item.insn_id
+    elif isinstance(sched_item, Barrier):
+        if (hasattr(sched_item, "originating_insn_id")
+                and sched_item.originating_insn_id is not None):
+            yield sched_item.originating_insn_id
+
 # }}}
 
 
@@ -541,7 +552,7 @@ class ScheduleDebugInput(Exception):
 
 # {{{ scheduling algorithm
 
-class SchedulerState(Record):
+class SchedulerState(ImmutableRecord):
     """
     .. attribute:: kernel
 
@@ -572,12 +583,37 @@ class SchedulerState(Record):
 
         A :class:`frozenset` of all inames ever entered.
 
+    .. attribute:: enclosing_subkernel_inames
+
+        The inames that were active upon entry into the most recently
+        entered subkernel.
+
     .. attribute:: schedule
 
     .. attribute:: scheduled_insn_ids
 
     .. attribute:: unscheduled_insn_ids
 
+    .. attribute:: preschedule
+
+        A sequence of schedule items that must be inserted into the
+        schedule, with their relative order preserved.
+
+    .. attribute:: prescheduled_insn_ids
+
+        A :class:`frozenset` of the ids of all instructions that occur
+        in the preschedule.
+
+    .. attribute:: prescheduled_inames
+
+        A :class:`frozenset` of all inames whose loops occur in the
+        preschedule.
+
+    .. attribute:: may_schedule_global_barriers
+
+        Whether global barriers may currently be scheduled.
+
+    .. attribute:: within_subkernel
+
+        Whether the scheduler is currently inside a subkernel.
+
     .. attribute:: group_insn_counts
 
         A mapping from instruction group names to the number of instructions
@@ -619,6 +655,11 @@ def generate_loop_schedules_internal(
 
     active_inames_set = frozenset(sched_state.active_inames)
 
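+    # Peek at the next prescheduled item, if any; it constrains what may be
+    # scheduled next.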
+    next_preschedule_item = (
+        sched_state.preschedule[0]
+        if len(sched_state.preschedule) > 0
+        else None)
+
     # {{{ decide about debug mode
 
     debug_mode = False
@@ -637,6 +678,10 @@ def generate_loop_schedules_internal(
         print(75*"=")
         print("CURRENT SCHEDULE:")
         print(dump_schedule(sched_state.kernel, sched_state.schedule))
+        if sched_state.preschedule:
+            print(75*"=")
+            print("PRESCHEDULED ITEMS AWAITING SCHEDULING:")
+            print(dump_schedule(sched_state.kernel, sched_state.preschedule))
         #print("boost allowed:", allow_boost)
         print(75*"=")
         print("LOOP NEST MAP (inner: outer):")
@@ -652,6 +697,54 @@ def generate_loop_schedules_internal(
 
     # }}}
 
+    # {{{ see if we have reached the start/end of kernel in the preschedule
+
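+    # CallKernel and ReturnFromKernel preschedule items mark subkernel
+    # boundaries: entering a subkernel disallows global barriers and records
+    # the enclosing inames, and a subkernel may only be left once all of its
+    # loops have been closed.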
+    if isinstance(next_preschedule_item, CallKernel):
+        assert sched_state.within_subkernel is False
+        for result in generate_loop_schedules_internal(
+                sched_state.copy(
+                    schedule=sched_state.schedule + (next_preschedule_item,),
+                    preschedule=sched_state.preschedule[1:],
+                    within_subkernel=True,
+                    may_schedule_global_barriers=False,
+                    enclosing_subkernel_inames=sched_state.active_inames),
+                allow_boost=rec_allow_boost,
+                debug=debug):
+            yield result
+
+    if isinstance(next_preschedule_item, ReturnFromKernel):
+        assert sched_state.within_subkernel is True
+        # Make sure all subkernel inames have finished.
+        if sched_state.active_inames == sched_state.enclosing_subkernel_inames:
+            for result in generate_loop_schedules_internal(
+                    sched_state.copy(
+                        schedule=sched_state.schedule + (next_preschedule_item,),
+                        preschedule=sched_state.preschedule[1:],
+                        within_subkernel=False,
+                        may_schedule_global_barriers=True),
+                    allow_boost=rec_allow_boost,
+                    debug=debug):
+                yield result
+
+    # }}}
+
+    # {{{ see if there are pending local barriers in the preschedule
+
+    # Local barriers do not have associated instructions, so they need to
+    # be handled separately from instructions.
+    if (
+            isinstance(next_preschedule_item, Barrier)
+            and next_preschedule_item.kind == "local"):
+        for result in generate_loop_schedules_internal(
+                sched_state.copy(
+                    schedule=sched_state.schedule + (next_preschedule_item,),
+                    preschedule=sched_state.preschedule[1:]),
+                allow_boost=rec_allow_boost,
+                debug=debug):
+            yield result
+
+    # }}}
+
     # {{{ see if any insns are ready to be scheduled now
 
     # Also take note of insns that have a chance of being schedulable inside
@@ -667,9 +760,16 @@ def generate_loop_schedules_internal(
         # schedule generation order.
         return (insn.priority, len(active_groups & insn.groups), insn.id)
 
-    insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids,
+    insn_ids_to_try = sorted(
+            # Non-prescheduled instructions go first.
+            sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids,
             key=insn_sort_key, reverse=True)
 
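+    # Prescheduled instructions are tried afterward, in preschedule order,
+    # since that order must be preserved.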
+    insn_ids_to_try.extend(
+        insn_id
+        for item in sched_state.preschedule
+        for insn_id in sched_item_to_insn_id(item))
+
     for insn_id in insn_ids_to_try:
         insn = kernel.id_to_insn[insn_id]
 
@@ -705,6 +805,46 @@ def generate_loop_schedules_internal(
                     print("instruction '%s' won't work under inames '%s'"
                             % (format_insn(kernel, insn.id), ",".join(have-want)))
 
+        # {{{ check if scheduling this insn is compatible with preschedule
+
+        if insn_id in sched_state.prescheduled_insn_ids:
+            if isinstance(next_preschedule_item, RunInstruction):
+                next_preschedule_insn_id = next_preschedule_item.insn_id
+            elif (
+                    isinstance(next_preschedule_item, Barrier)
+                    and next_preschedule_item.kind == "global"):
+                assert hasattr(next_preschedule_item, "originating_insn_id")
+                assert next_preschedule_item.originating_insn_id is not None
+                next_preschedule_insn_id = next_preschedule_item.originating_insn_id
+            else:
+                next_preschedule_insn_id = None
+
+            if next_preschedule_insn_id != insn_id:
+                if debug_mode:
+                    print("can't schedule '%s' because another preschedule "
+                          "instruction precedes it" % format_insn(kernel, insn.id))
+                is_ready = False
+
+        # }}}
+
+        # {{{ check if scheduler state allows insn scheduling
+
+        from loopy.kernel.instruction import BarrierInstruction
+        if isinstance(insn, BarrierInstruction) and insn.kind == "global":
+            if not sched_state.may_schedule_global_barriers:
+                if debug_mode:
+                    print("can't schedule '%s' because global barriers are "
+                          "not currently allowed" % format_insn(kernel, insn.id))
+                is_ready = False
+        else:
+            if not sched_state.within_subkernel:
+                if debug_mode:
+                    print("can't schedule '%s' because not within subkernel"
+                          % format_insn(kernel, insn.id))
+                is_ready = False
+
+        # }}}
+
         # {{{ determine group-based readiness
 
         if insn.conflicts_with_groups & active_groups:
@@ -761,6 +901,10 @@ def generate_loop_schedules_internal(
                     unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set,
                     schedule=(
                         sched_state.schedule + (RunInstruction(insn_id=insn.id),)),
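+                    # Advance the preschedule if this instruction was
+                    # prescheduled.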
+                    preschedule=(
+                        sched_state.preschedule
+                        if insn_id not in sched_state.prescheduled_insn_ids
+                        else sched_state.preschedule[1:]),
                     active_group_counts=new_active_group_counts,
                     uses_of_boostability=(
                         sched_state.uses_of_boostability
@@ -790,7 +934,17 @@ def generate_loop_schedules_internal(
     if last_entered_loop is not None:
         can_leave = True
 
-        if last_entered_loop not in sched_state.breakable_inames:
+        if (
+                last_entered_loop in sched_state.prescheduled_inames
+                and not (
+                    isinstance(next_preschedule_item, LeaveLoop)
+                    and next_preschedule_item.iname == last_entered_loop)):
+            # A prescheduled loop can only be left if the preschedule agrees.
+            if debug_mode:
+                print("cannot leave '%s' because of preschedule constraints"
+                      % last_entered_loop)
+            can_leave = False
+        elif last_entered_loop not in sched_state.breakable_inames:
             # If the iname is not breakable, then check that we've
             # scheduled all the instructions that require it.
 
@@ -857,12 +1011,19 @@ def generate_loop_schedules_internal(
                         break
 
             if can_leave and not debug_mode:
+
                 for sub_sched in generate_loop_schedules_internal(
                         sched_state.copy(
                             schedule=(
                                 sched_state.schedule
                                 + (LeaveLoop(iname=last_entered_loop),)),
-                            active_inames=sched_state.active_inames[:-1]),
+                            active_inames=sched_state.active_inames[:-1],
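+                            # Advance past the prescheduled LeaveLoop,
+                            # if applicable.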
+                            preschedule=(
+                                sched_state.preschedule
+                                if last_entered_loop
+                                not in sched_state.prescheduled_inames
+                                else sched_state.preschedule[1:]),
+                        ),
                         allow_boost=rec_allow_boost, debug=debug):
                     yield sub_sched
 
@@ -902,6 +1063,38 @@ def generate_loop_schedules_internal(
 
             # {{{ check if scheduling this iname now is allowed/plausible
 
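+            # A loop over a prescheduled iname may only be entered when the
+            # preschedule says it is next.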
+            if (
+                    iname in sched_state.prescheduled_inames
+                    and not (
+                        isinstance(next_preschedule_item, EnterLoop)
+                        and next_preschedule_item.iname == iname)):
+                if debug_mode:
+                    print("scheduling %s prohibited by preschedule constraints"
+                          % iname)
+                continue
+
+            if (
+                    not sched_state.within_subkernel
+                    and iname not in sched_state.prescheduled_inames):
+                # Avoid messing up some orderings such as picking:
+                #
+                # EnterLoop(temporary.reload)
+                # CallKernel
+                # ...
+                #
+                # instead of
+                #
+                # CallKernel
+                # EnterLoop(temporary.reload)
+                # ...
+                #
+                # This serves as a heuristic to catch some bad decisions
+                # early; the scheduler will not allow the first variant
+                # regardless.
+                if debug_mode:
+                    print("scheduling '%s' prohibited because we are outside "
+                          "a subkernel" % iname)
+                continue
+
             currently_accessible_inames = (
                     active_inames_set | sched_state.parallel_inames)
             if (
@@ -1063,6 +1256,10 @@ def generate_loop_schedules_internal(
                                 entered_inames=(
                                     sched_state.entered_inames
                                     | frozenset((iname,))),
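+                                # Advance past the prescheduled EnterLoop,
+                                # if applicable.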
+                                preschedule=(
+                                    sched_state.preschedule
+                                    if iname not in sched_state.prescheduled_inames
+                                    else sched_state.preschedule[1:]),
                                 ),
                             allow_boost=rec_allow_boost,
                             debug=debug):
@@ -1082,7 +1279,10 @@ def generate_loop_schedules_internal(
         if inp:
             raise ScheduleDebugInput(inp)
 
-    if not sched_state.active_inames and not sched_state.unscheduled_insn_ids:
+    if (
+            not sched_state.active_inames
+            and not sched_state.unscheduled_insn_ids
+            and not sched_state.preschedule):
         # if done, yield result
         debug.log_success(sched_state.schedule)
 
@@ -1138,7 +1338,8 @@ def convert_barrier_instructions_to_barriers(kernel, schedule):
             if isinstance(insn, BarrierInstruction):
                 result.append(Barrier(
                     kind=insn.kind,
-                    originating_insn_id=insn.id))
+                    originating_insn_id=insn.id,
+                    comment="Barrier inserted due to %s" % insn.id))
                 continue
 
         result.append(sched_item)
@@ -1150,7 +1351,7 @@ def convert_barrier_instructions_to_barriers(kernel, schedule):
 
 # {{{ barrier insertion/verification
 
-class DependencyRecord(Record):
+class DependencyRecord(ImmutableRecord):
     """
     .. attribute:: source
 
@@ -1175,7 +1376,7 @@ class DependencyRecord(Record):
     """
 
     def __init__(self, source, target, dep_descr, variable, var_kind):
-        Record.__init__(self,
+        ImmutableRecord.__init__(self,
                 source=source,
                 target=target,
                 dep_descr=dep_descr,
@@ -1209,8 +1410,8 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     if reverse:
         source, target = target, source
 
-    if source.id in target.no_sync_with:
-        return None
+    if source.id in kernel.get_nosync_set(target.id, var_kind):
+        return
 
     # {{{ check that a dependency exists
 
@@ -1309,6 +1510,9 @@ def get_tail_starting_at_last_barrier(schedule, kind):
         elif isinstance(sched_item, (EnterLoop, LeaveLoop)):
             pass
 
+        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+            pass
+
         else:
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
@@ -1322,7 +1526,8 @@ def insn_ids_from_schedule(schedule):
         if isinstance(sched_item, RunInstruction):
             result.append(sched_item.insn_id)
 
-        elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier)):
+        elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel,
+                                     ReturnFromKernel)):
             pass
 
         else:
@@ -1455,8 +1660,22 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
                             source=dep_src_insn_id,
                             reverse=reverse, var_kind=kind)
                     if dep:
-                        issue_barrier(dep=dep)
-                        break
+                        if verify_only:
+                            from loopy.diagnostic import MissingBarrierError
+                            raise MissingBarrierError(
+                                    "Dependency '%s' (for variable '%s') "
+                                    "requires synchronization "
+                                    "by a %s barrier (add a 'no_sync_with' "
+                                    "instruction option to state that no"
+                                    "synchronization is needed)"
+                                    % (
+                                        dep.dep_descr.format(
+                                            tgt=dep.target.id, src=dep.source.id),
+                                        dep.variable,
+                                        kind))
+                        else:
+                            issue_barrier(dep=dep)
+                            break
 
             # }}}
 
@@ -1500,7 +1719,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
                                 "Dependency '%s' (for variable '%s') "
                                 "requires synchronization "
                                 "by a %s barrier (add a 'no_sync_with' "
-                                "instruction option to state that no"
+                                "instruction option to state that no "
                                 "synchronization is needed)"
                                 % (
                                     dep.dep_descr.format(
@@ -1515,6 +1734,10 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
             result.append(sched_item)
             candidates.add(sched_item.insn_id)
 
+        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+            result.append(sched_item)
+            i += 1
+
         else:
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
@@ -1536,7 +1759,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
 
 def generate_loop_schedules(kernel, debug_args={}):
     from loopy.kernel import kernel_state
-    if kernel.state != kernel_state.PREPROCESSED:
+    if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
@@ -1547,6 +1770,18 @@ def generate_loop_schedules(kernel, debug_args={}):
 
     debug = ScheduleDebugger(**debug_args)
 
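+    # A kernel that is already scheduled (e.g. while re-scheduling after
+    # save_and_reload_temporaries) supplies its existing schedule as a
+    # preschedule whose ordering must be preserved.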
+    preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else ()
+
+    prescheduled_inames = set(
+            insn.iname
+            for insn in preschedule
+            if isinstance(insn, EnterLoop))
+
+    prescheduled_insn_ids = set(
+        insn_id
+        for item in preschedule
+        for insn_id in sched_item_to_insn_id(item))
+
     from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag
     ilp_inames = set(
             iname
@@ -1573,14 +1808,22 @@ def generate_loop_schedules(kernel, debug_args={}):
             ilp_inames=ilp_inames,
             vec_inames=vec_inames,
 
+            prescheduled_inames=prescheduled_inames,
+            prescheduled_insn_ids=prescheduled_insn_ids,
+
             # time-varying part
             active_inames=(),
             entered_inames=frozenset(),
+            enclosing_subkernel_inames=(),
 
             schedule=(),
 
             unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
             scheduled_insn_ids=frozenset(),
+            within_subkernel=kernel.state != kernel_state.SCHEDULED,
+            may_schedule_global_barriers=True,
+
+            preschedule=preschedule,
 
             # ilp and vec are not parallel for the purposes of the scheduler
             parallel_inames=parallel_inames - ilp_inames - vec_inames,
@@ -1638,18 +1881,15 @@ def generate_loop_schedules(kernel, debug_args={}):
 
                 gsize, lsize = kernel.get_grid_size_upper_bounds()
 
-                if gsize or lsize:
+                if (gsize or lsize):
                     if not kernel.options.disable_global_barriers:
                         logger.info("%s: barrier insertion: global" % kernel.name)
-
                         gen_sched = insert_barriers(kernel, gen_sched,
                                 reverse=False, kind="global", verify_only=True)
 
                     logger.info("%s: barrier insertion: local" % kernel.name)
-
                     gen_sched = insert_barriers(kernel, gen_sched,
                             reverse=False, kind="local", verify_only=False)
-
                     logger.info("%s: barrier insertion: done" % kernel.name)
 
                 new_kernel = kernel.copy(
@@ -1658,7 +1898,12 @@ def generate_loop_schedules(kernel, debug_args={}):
 
                 from loopy.schedule.device_mapping import \
                         map_schedule_onto_host_or_device
-                new_kernel = map_schedule_onto_host_or_device(new_kernel)
+                if kernel.state != kernel_state.SCHEDULED:
+                    # Device mapper only gets run once.
+                    new_kernel = map_schedule_onto_host_or_device(new_kernel)
+
+                from loopy.schedule.tools import add_extra_args_to_schedule
+                new_kernel = add_extra_args_to_schedule(new_kernel)
                 yield new_kernel
 
                 debug.start()
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index ca782a3d8ca85ea6250f7c9317ca0947db28d5e8..1a0789c2f61e21e4a0371e2a73195c9771245527 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -23,14 +23,13 @@ THE SOFTWARE.
 """
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel.data import TemporaryVariable, temp_var_scope
-from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem,
-                            EnterLoop, LeaveLoop, ReturnFromKernel,
-                            RunInstruction)
-from pytools import Record, memoize_method
+from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop,
+                            ReturnFromKernel, RunInstruction)
+from loopy.schedule.tools import get_block_boundaries
 
 
 def map_schedule_onto_host_or_device(kernel):
+    # FIXME: Should be idempotent.
     from loopy.kernel import kernel_state
     assert kernel.state == kernel_state.SCHEDULED
 
@@ -53,659 +52,14 @@ def map_schedule_onto_host_or_device(kernel):
         kernel = map_schedule_onto_host_or_device_impl(
                 kernel, device_prog_name_gen)
 
-    return restore_and_save_temporaries(
-        add_extra_args_to_schedule(kernel))
-
-
-# {{{ Schedule / instruction utilities
-
-def get_block_boundaries(schedule):
-    """
-    Return a dictionary mapping indices of
-    :class:`loopy.schedule.BlockBeginItem`s to
-    :class:`loopy.schedule.BlockEndItem`s and vice versa.
-    """
-    block_bounds = {}
-    active_blocks = []
-    for idx, sched_item in enumerate(schedule):
-        if isinstance(sched_item, BeginBlockItem):
-            active_blocks.append(idx)
-        elif isinstance(sched_item, EndBlockItem):
-            start = active_blocks.pop()
-            block_bounds[start] = idx
-            block_bounds[idx] = start
-    return block_bounds
-
-
-def get_hw_inames(kernel, insn):
-    """
-    Return the inames that insn runs in and that are tagged as hardware
-    parallel.
-    """
-    from loopy.kernel.data import HardwareParallelTag
-    return set(iname for iname in kernel.insn_inames(insn)
-        if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag))
-
-
-def get_common_hw_inames(kernel, insn_ids):
-    """
-    Return the common set of hardware parallel tagged inames among
-    the list of instructions.
-    """
-    # Get the list of hardware inames in which the temporary is defined.
-    if len(insn_ids) == 0:
-        return set()
-    return set.intersection(
-        *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids))
-
-
-def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel):
-    from loopy.kernel.data import HardwareParallelTag
-    new_schedule = []
-
-    for item in kernel.schedule:
-        if isinstance(item, (EnterLoop, LeaveLoop)):
-            tag = kernel.iname_to_tag.get(item.iname)
-            if isinstance(tag, HardwareParallelTag):
-                continue
-        new_schedule.append(item)
-
-    return kernel.copy(schedule=new_schedule)
-
-# }}}
-
-
-# {{{ Use / def utilities
-
-def filter_out_subscripts(exprs):
-    """
-    Remove subscripts from expressions in `exprs`.
-    """
-    result = set()
-    from pymbolic.primitives import Subscript
-    for expr in exprs:
-        if isinstance(expr, Subscript):
-            expr = expr.aggregate
-        result.add(expr)
-    return result
-
-
-def filter_items_by_varname(pred, kernel, items):
-    """
-    Keep only the values in `items` whose variable names satisfy `pred`.
-    """
-    from pymbolic.primitives import Subscript, Variable
-    result = set()
-    for item in items:
-        base = item
-        if isinstance(base, Subscript):
-            base = base.aggregate
-        if isinstance(base, Variable):
-            base = base.name
-        if pred(kernel, base):
-            result.add(item)
-    return result
-
-
-from functools import partial
-
-filter_temporaries = partial(filter_items_by_varname,
-    lambda kernel, name: name in kernel.temporary_variables)
-
-filter_scalar_temporaries = partial(filter_items_by_varname,
-    lambda kernel, name: name in kernel.temporary_variables and
-        len(kernel.temporary_variables[name].shape) == 0)
-
-
-def get_use_set(insn, include_subscripts=True):
-    """
-    Return the use-set of the instruction, for liveness analysis.
-    """
-    result = insn.read_dependency_names()
-    if not include_subscripts:
-        result = filter_out_subscripts(result)
-    return result
-
-
-def get_def_set(insn, include_subscripts=True):
-    """
-    Return the def-set of the instruction, for liveness analysis.
-    """
-    result = insn.write_dependency_names()
-    if not include_subscripts:
-        result = filter_out_subscripts(result)
-    return result
-
-
-def get_def_and_use_lists_for_all_temporaries(kernel):
-    """
-    Return a pair `def_lists`, `use_lists` which map temporary variable
-    names to lists of instructions where they are defined or used.
-    """
-    def_lists = dict((t, []) for t in kernel.temporary_variables)
-    use_lists = dict((t, []) for t in kernel.temporary_variables)
-
-    for insn in kernel.instructions:
-        assignees = get_def_set(insn, include_subscripts=False)
-        dependencies = get_use_set(insn, include_subscripts=False)
-
-        from pymbolic.primitives import Variable
-
-        for assignee in assignees:
-            if isinstance(assignee, Variable):
-                assignee = assignee.name
-            if assignee in kernel.temporary_variables:
-                def_lists[assignee].append(insn.id)
-
-        for dep in dependencies:
-            if isinstance(dep, Variable):
-                dep = dep.name
-            if dep in kernel.temporary_variables:
-                use_lists[dep].append(insn.id)
-
-    return def_lists, use_lists
-
-
-def get_temporaries_defined_and_used_in_subrange(
-        kernel, schedule, start_idx, end_idx):
-    defs = set()
-    uses = set()
-
-    for idx in range(start_idx, end_idx + 1):
-        sched_item = schedule[idx]
-        if isinstance(sched_item, RunInstruction):
-            insn = kernel.id_to_insn[sched_item.insn_id]
-            defs.update(
-                filter_temporaries(
-                    kernel, get_def_set(insn)))
-            uses.update(
-                filter_temporaries(
-                    kernel, get_use_set(insn)))
-
-    return defs, uses
-
-# }}}
-
-
-# {{{ Liveness analysis
-
-def compute_live_temporaries(kernel, schedule):
-    """
-    Compute live-in and live-out sets for temporary variables.
-    """
-    live_in = [set() for i in range(len(schedule) + 1)]
-    live_out = [set() for i in range(len(schedule))]
-
-    id_to_insn = kernel.id_to_insn
-    block_bounds = get_block_boundaries(schedule)
-
-    # {{{ Liveness analysis implementation
-
-    def compute_subrange_liveness(start_idx, end_idx):
-        idx = end_idx
-        while start_idx <= idx:
-            sched_item = schedule[idx]
-            if isinstance(sched_item, LeaveLoop):
-                start = block_bounds[idx]
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                compute_subrange_liveness(start + 1, idx - 1)
-                prev_live_in = live_in[start].copy()
-                live_in[start] = live_out[start] = live_in[start + 1]
-                # Propagate live values through the loop.
-                if live_in[start] != prev_live_in:
-                    live_out[idx] |= live_in[start]
-                    live_in[idx] = live_out[idx]
-                    compute_subrange_liveness(start + 1, idx - 1)
-                idx = start - 1
-
-            elif isinstance(sched_item, ReturnFromKernel):
-                start = block_bounds[idx]
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                compute_subrange_liveness(start + 1, idx - 1)
-                live_in[start] = live_out[start] = live_in[start + 1]
-                idx = start - 1
-
-            elif isinstance(sched_item, RunInstruction):
-                live_out[idx] = live_in[idx + 1]
-                insn = id_to_insn[sched_item.insn_id]
-                defs = filter_scalar_temporaries(kernel,
-                    get_def_set(insn, include_subscripts=False))
-                uses = filter_temporaries(kernel,
-                    get_use_set(insn, include_subscripts=False))
-                live_in[idx] = (live_out[idx] - defs) | uses
-                idx -= 1
-
-            elif isinstance(sched_item, Barrier):
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                idx -= 1
-            else:
-                raise LoopyError("unexpected type of schedule item: %s"
-                        % type(sched_item).__name__)
-
-    # }}}
-
-    # Compute live variables
-    compute_subrange_liveness(0, len(schedule) - 1)
-    live_in = live_in[:-1]
-
-    if 0:
-        print(kernel)
-        print("Live-in values:")
-        for i, li in enumerate(live_in):
-            print("{}: {}".format(i, ", ".join(li)))
-        print("Live-out values:")
-        for i, lo in enumerate(live_out):
-            print("{}: {}".format(i, ", ".join(lo)))
-
-    # Strip off subscripts.
-    live_in = [filter_out_subscripts(li) for li in live_in]
-    live_out = [filter_out_subscripts(lo) for lo in live_out]
-
-    return live_in, live_out
-
-# }}}
-
-
-# {{{ Temporary promotion
-
-class PromotedTemporary(Record):
-    """
-    .. attribute:: name
-
-        The name of the new temporary.
-
-    .. attribute:: orig_temporary
-
-        The original temporary variable object.
-
-    .. attribute:: hw_inames
-
-        The common list of hw axes that define the original object.
-
-    .. attribute:: shape_prefix
-
-        A list of expressions, to be added in front of the shape
-        of the promoted temporary value
-    """
-
-    @memoize_method
-    def as_variable(self):
-        temporary = self.orig_temporary
-        return TemporaryVariable(
-            name=self.name,
-            dtype=temporary.dtype,
-            scope=temp_var_scope.GLOBAL,
-            shape=self.new_shape)
-
-    @property
-    def new_shape(self):
-        return self.shape_prefix + self.orig_temporary.shape
-
-
-def determine_temporaries_to_promote(kernel, temporaries, name_gen):
-    """
-    For each temporary in the passed list of temporaries, construct a
-    :class:`PromotedTemporary` which describes how the temporary should
-    get promoted into global storage.
-
-    :returns: A :class:`dict` mapping temporary names from `temporaries` to
-              :class:`PromotedTemporary` objects
-    """
-    new_temporaries = {}
-
-    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)
-
-    from loopy.kernel.data import LocalIndexTag
-
-    for temporary in temporaries:
-        temporary = kernel.temporary_variables[temporary]
-        if temporary.scope == temp_var_scope.GLOBAL:
-            # Nothing to be done for global temporaries (I hope)
-            continue
-
-        assert temporary.base_storage is None, \
-            "Cannot promote temporaries with base_storage to global"
-
-        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
-        # is associated with. This is used for determining the shape of the
-        # global storage needed for saving and restoring the temporary across
-        # kernel calls.
-        #
-        # TODO: Make a policy decision about which dimensions to use. Currently,
-        # the code looks at each instruction that defines or uses the temporary,
-        # and takes the common set of hw-parallel tagged inames associated with
-        # these instructions.
-        #
-        # Furthermore, in the case of local temporaries, inames that are tagged
-        # hw-local do not contribute to the global storage shape.
-        hw_inames = get_common_hw_inames(kernel,
-            def_lists[temporary.name] + use_lists[temporary.name])
-
-        # This takes advantage of the fact that g < l in the alphabet :)
-        hw_inames = sorted(hw_inames,
-            key=lambda iname: str(kernel.iname_to_tag[iname]))
-
-        # Calculate the sizes of the dimensions that get added in front for
-        # the global storage of the temporary.
-        shape_prefix = []
-
-        backing_hw_inames = []
-        for iname in hw_inames:
-            tag = kernel.iname_to_tag[iname]
-            is_local_iname = isinstance(tag, LocalIndexTag)
-            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
-                # Restrict shape to that of group inames for locals.
-                continue
-            backing_hw_inames.append(iname)
-            from loopy.isl_helpers import static_max_of_pw_aff
-            from loopy.symbolic import aff_to_expr
-            shape_prefix.append(
-                aff_to_expr(
-                    static_max_of_pw_aff(
-                        kernel.get_iname_bounds(iname).size, False)))
-
-        backing_temporary = PromotedTemporary(
-            name=name_gen(temporary.name),
-            orig_temporary=temporary,
-            shape_prefix=tuple(shape_prefix),
-            hw_inames=backing_hw_inames)
-        new_temporaries[temporary.name] = backing_temporary
-
-    return new_temporaries
-
-# }}}
-
-
-# {{{ Domain augmentation
-
-def augment_domain_for_temporary_promotion(
-        kernel, domain, promoted_temporary, mode, name_gen):
-    """
-    Add new axes to the domain corresponding to the dimensions of
-    `promoted_temporary`.
-    """
-    import islpy as isl
-
-    orig_temporary = promoted_temporary.orig_temporary
-    orig_dim = domain.dim(isl.dim_type.set)
-    dims_to_insert = len(orig_temporary.shape)
-
-    iname_to_tag = {}
-
-    # Add dimension-dependent inames.
-    dim_inames = []
-
-    domain = domain.add(isl.dim_type.set, dims_to_insert)
-    for t_idx in range(len(orig_temporary.shape)):
-        new_iname = name_gen("{name}_{mode}_dim_{dim}".
-            format(name=orig_temporary.name,
-                   mode=mode,
-                   dim=t_idx))
-        domain = domain.set_dim_name(
-            isl.dim_type.set, orig_dim + t_idx, new_iname)
-        if orig_temporary.is_local:
-            # If the temporary is has local scope, then loads / stores can be
-            # done in parallel.
-            from loopy.kernel.data import AutoFitLocalIndexTag
-            iname_to_tag[new_iname] = AutoFitLocalIndexTag()
-
-        dim_inames.append(new_iname)
-
-        # Add size information.
-        aff = isl.affs_from_space(domain.space)
-        domain &= aff[0].le_set(aff[new_iname])
-        size = orig_temporary.shape[t_idx]
-        from loopy.symbolic import aff_from_expr
-        domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size))
-
-    hw_inames = []
-
-    # Add hardware inames duplicates.
-    for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
-        new_iname = name_gen("{name}_{mode}_hw_dim_{dim}".
-            format(name=orig_temporary.name,
-                   mode=mode,
-                   dim=t_idx))
-        hw_inames.append(new_iname)
-        iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname]
-
-    from loopy.isl_helpers import duplicate_axes
-    domain = duplicate_axes(
-        domain, promoted_temporary.hw_inames, hw_inames)
-
-    # The operations on the domain above return a Set object, but the
-    # underlying domain should be expressible as a single BasicSet.
-    domain_list = domain.get_basic_set_list()
-    assert domain_list.n_basic_set() == 1
-    domain = domain_list.get_basic_set(0)
-    return domain, hw_inames, dim_inames, iname_to_tag
-
-# }}}
-
-
-def restore_and_save_temporaries(kernel):
-    """
-    Add code that loads / spills the temporaries in the kernel which are
-    live across sub-kernel calls.
-    """
-    # Compute live temporaries.
-    live_in, live_out = compute_live_temporaries(kernel, kernel.schedule)
-
-    # Create kernel variables based on live temporaries.
-    inter_kernel_temporaries = set()
-
-    call_count = 0
-    for idx, sched_item in enumerate(kernel.schedule):
-        if isinstance(sched_item, CallKernel):
-            inter_kernel_temporaries |= filter_out_subscripts(live_in[idx])
-            call_count += 1
-
-    if call_count == 1:
-        # Single call corresponds to a kernel which has not been split -
-        # no need for restores / spills of temporaries.
-        return kernel
-
-    name_gen = kernel.get_var_name_generator()
-    new_temporaries = determine_temporaries_to_promote(
-        kernel, inter_kernel_temporaries, name_gen)
-
-    # {{{ Insert loads and spills of new temporaries
-
-    new_schedule = []
-    new_instructions = []
-    new_iname_to_tag = {}
-
-    idx = 0
-    schedule = kernel.schedule
-    while idx < len(schedule):
-        sched_item = schedule[idx]
-
-        if not isinstance(sched_item, CallKernel):
-            new_schedule.append(sched_item)
-            idx += 1
-            continue
-
-        subkernel_prolog = []
-        subkernel_epilog = []
-        subkernel_schedule = []
-
-        start_idx = idx
-        idx += 1
-        while not isinstance(schedule[idx], ReturnFromKernel):
-            subkernel_schedule.append(schedule[idx])
-            idx += 1
-
-        subkernel_defs, subkernel_uses = \
-            get_temporaries_defined_and_used_in_subrange(
-                kernel, schedule, start_idx + 1, idx - 1)
-
-        from loopy.kernel.data import temp_var_scope
-        # Filter out temporaries that are global.
-        subkernel_globals = set(
-            tval for tval in subkernel_defs | subkernel_uses
-            if kernel.temporary_variables[tval].scope == temp_var_scope.GLOBAL)
-
-        tvals_to_spill = (subkernel_defs - subkernel_globals) & live_out[idx]
-        # Need to load tvals_to_spill, to avoid overwriting entries that the
-        # code doesn't touch when doing the spill.
-        tvals_to_load = ((subkernel_uses - subkernel_globals)
-            | tvals_to_spill) & live_in[start_idx]
-
-        # Add new arguments.
-        sched_item = sched_item.copy(
-            extra_args=sched_item.extra_args
-            + sorted(new_temporaries[tv].name
-                     for tv in tvals_to_load | tvals_to_spill))
-
-        # {{{ Add all the loads and spills.
-
-        def insert_loads_or_spills(tvals, mode):
-            assert mode in ["load", "spill"]
-            local_temporaries = set()
-
-            code_block = \
-                subkernel_prolog if mode == "load" else subkernel_epilog
-
-            new_kernel = kernel
-
-            for tval in tvals:
-                from loopy.kernel.tools import DomainChanger
-                tval_hw_inames = new_temporaries[tval].hw_inames
-                dchg = DomainChanger(new_kernel,
-                    frozenset(sched_item.extra_inames + tval_hw_inames))
-                domain = dchg.domain
-
-                domain, hw_inames, dim_inames, itt = \
-                    augment_domain_for_temporary_promotion(
-                        new_kernel, domain, new_temporaries[tval], mode,
-                        name_gen)
-                new_iname_to_tag.update(itt)
-
-                new_kernel = dchg.get_kernel_with(domain)
-
-                # Add the load / spill instruction.
-                insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode))
-
-                def subscript_or_var(agg, subscript):
-                    from pymbolic.primitives import Subscript, Variable
-                    if len(subscript) == 0:
-                        return Variable(agg)
-                    else:
-                        return Subscript(
-                            Variable(agg),
-                            tuple(map(Variable, subscript)))
-
-                args = (
-                    subscript_or_var(
-                        tval, dim_inames),
-                    subscript_or_var(
-                        new_temporaries[tval].name, hw_inames + dim_inames))
-
-                if mode == "spill":
-                    args = reversed(args)
-
-                from loopy.kernel.data import Assignment
-                new_insn = Assignment(*args, id=insn_id,
-                    within_inames=frozenset(hw_inames + dim_inames),
-                    within_inames_is_final=True)
-
-                new_instructions.append(new_insn)
-
-                loop_begin = [EnterLoop(iname=iname) for iname in dim_inames]
-                loop_end = list(reversed([
-                    LeaveLoop(iname=iname) for iname in dim_inames]))
-                code_block.extend(
-                    loop_begin +
-                    [RunInstruction(insn_id=insn_id)] +
-                    loop_end)
-                if new_temporaries[tval].orig_temporary.is_local:
-                    local_temporaries.add(new_temporaries[tval].name)
-
-            # After loading / before spilling local temporaries, we need to
-            # insert a barrier.
-            if local_temporaries:
-                if mode == "load":
-                    subkernel_prolog.append(
-                        Barrier(kind="local",
-                                comment="for loads of {0}".format(
-                                    ", ".join(sorted(local_temporaries)))))
-                else:
-                    subkernel_epilog.insert(0,
-                        Barrier(kind="local",
-                                comment="for spills of {0}".format(
-                                    ", ".join(sorted(local_temporaries)))))
-            return new_kernel
-
-        kernel = insert_loads_or_spills(tvals_to_load, "load")
-        kernel = insert_loads_or_spills(tvals_to_spill, "spill")
-
-        # }}}
-
-        new_schedule.extend(
-            [sched_item] +
-            subkernel_prolog +
-            subkernel_schedule +
-            subkernel_epilog +
-            # ReturnFromKernel
-            [schedule[idx]])
-
-        # ReturnFromKernel
-        idx += 1
-
-    # }}}
-
-    new_iname_to_tag.update(kernel.iname_to_tag)
-    updated_temporary_variables = dict(
-        (t.name, t.as_variable()) for t in new_temporaries.values())
-    updated_temporary_variables.update(kernel.temporary_variables)
-
-    kernel = kernel.copy(
-        iname_to_tag=new_iname_to_tag,
-        temporary_variables=updated_temporary_variables,
-        instructions=kernel.instructions + new_instructions,
-        schedule=new_schedule
-        )
-
-    from loopy.kernel.tools import assign_automatic_axes
-    kernel = assign_automatic_axes(kernel)
-
-    # Once assign_automatic_axes() does its job, loops in the schedule
-    # for newly hardware-tagged inames are no longer necessary (and in
-    # fact illegal), so remove them.
-    kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel)
-
     return kernel
 
 
-def add_extra_args_to_schedule(kernel):
-    """
-    Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel`
-    instructions in the schedule with global temporaries.
-    """
-    new_schedule = []
-
-    block_bounds = get_block_boundaries(kernel.schedule)
-    for idx, sched_item in enumerate(kernel.schedule):
-        if isinstance(sched_item, CallKernel):
-            defs, uses = get_temporaries_defined_and_used_in_subrange(
-                   kernel, kernel.schedule, idx + 1, block_bounds[idx] - 1)
-            # Filter out temporaries that are global.
-            extra_args = (tv for tv in defs | uses if
-                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
-                and
-                kernel.temporary_variables[tv].initializer is None)
-            new_schedule.append(sched_item.copy(extra_args=sorted(extra_args)))
-        else:
-            new_schedule.append(sched_item)
-
-    return kernel.copy(schedule=new_schedule)
-
-
 def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
     schedule = kernel.schedule
     loop_bounds = get_block_boundaries(schedule)
 
-    # {{{ Inner mapper function
+    # {{{ inner mapper function
 
     dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[])
     dummy_return = ReturnFromKernel(kernel_name="")
@@ -760,6 +114,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
                             [dummy_call.copy()] +
                             current_chunk +
                             [dummy_return.copy()])
+                    new_schedule.append(sched_item)
                     current_chunk = []
                 else:
                     current_chunk.append(sched_item)
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de677e72708be844a5276b3d40ace8b1dad9da0
--- /dev/null
+++ b/loopy/schedule/tools.py
@@ -0,0 +1,191 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.kernel.data import temp_var_scope
+from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem,
+                            RunInstruction, Barrier)
+
+from pytools import memoize_method
+
+
+# {{{ block boundary finder
+
+def get_block_boundaries(schedule):
+    """
+    Return a dictionary mapping indices of
+    :class:`loopy.schedule.BeginBlockItem`s to
+    :class:`loopy.schedule.EndBlockItem`s and vice versa.
+    """
+    block_bounds = {}
+    active_blocks = []
+    for idx, sched_item in enumerate(schedule):
+        if isinstance(sched_item, BeginBlockItem):
+            active_blocks.append(idx)
+        elif isinstance(sched_item, EndBlockItem):
+            start = active_blocks.pop()
+            block_bounds[start] = idx
+            block_bounds[idx] = start
+    return block_bounds
+
+# }}}
+
+
+# {{{ instruction query utility
+
+class InstructionQuery(object):
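+    """Answers queries about the instructions, temporaries, and inames
+    that belong to each subkernel of a scheduled kernel. Query results
+    are memoized.
+    """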
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        block_bounds = get_block_boundaries(kernel.schedule)
+        subkernel_slices = {}
+        from six import iteritems
+        for start, end in iteritems(block_bounds):
+            sched_item = kernel.schedule[start]
+            if isinstance(sched_item, CallKernel):
+                subkernel_slices[sched_item.kernel_name] = slice(start, end + 1)
+        self.subkernel_slices = subkernel_slices
+
+    @memoize_method
+    def subkernels(self):
+        return frozenset(self.subkernel_slices.keys())
+
+    @memoize_method
+    def insns_reading_or_writing(self, var):
+        return frozenset(insn.id for insn in self.kernel.instructions
+            if var in insn.read_dependency_names()
+                or var in insn.assignee_var_names())
+
+    @memoize_method
+    def insns_in_subkernel(self, subkernel):
+        return frozenset(sched_item.insn_id for sched_item
+            in self.kernel.schedule[self.subkernel_slices[subkernel]]
+            if isinstance(sched_item, RunInstruction))
+
+    @memoize_method
+    def temporaries_read_in_subkernel(self, subkernel):
+        return frozenset(
+            var
+            for insn in self.insns_in_subkernel(subkernel)
+            for var in self.kernel.id_to_insn[insn].read_dependency_names()
+            if var in self.kernel.temporary_variables)
+
+    @memoize_method
+    def temporaries_written_in_subkernel(self, subkernel):
+        return frozenset(
+            var
+            for insn in self.insns_in_subkernel(subkernel)
+            for var in self.kernel.id_to_insn[insn].assignee_var_names()
+            if var in self.kernel.temporary_variables)
+
+    @memoize_method
+    def temporaries_read_or_written_in_subkernel(self, subkernel):
+        return (
+            self.temporaries_read_in_subkernel(subkernel) |
+            self.temporaries_written_in_subkernel(subkernel))
+
+    @memoize_method
+    def inames_in_subkernel(self, subkernel):
+        subkernel_start = self.subkernel_slices[subkernel].start
+        return frozenset(self.kernel.schedule[subkernel_start].extra_inames)
+
+    @memoize_method
+    def pre_and_post_barriers(self, subkernel):
+        subkernel_start = self.subkernel_slices[subkernel].start
+        subkernel_end = self.subkernel_slices[subkernel].stop
+
+        def is_global_barrier(item):
+            return isinstance(item, Barrier) and item.kind == "global"
+
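+        # Search backward from the subkernel start and forward from the
+        # subkernel end for the nearest global barriers, returning the ids
+        # of their originating instructions (or None if there is none).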
+        try:
+            pre_barrier = next(item for item in
+                    self.kernel.schedule[subkernel_start::-1]
+                    if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            pre_barrier = None
+
+        try:
+            post_barrier = next(item for item in
+                    self.kernel.schedule[subkernel_end:]
+                    if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            post_barrier = None
+
+        return (pre_barrier, post_barrier)
+
+    @memoize_method
+    def hw_inames(self, insn_id):
+        """
+        Return the inames that the instruction with id *insn_id* runs in
+        and that are tagged as hardware parallel.
+        """
+        from loopy.kernel.data import HardwareParallelTag
+        return set(iname for iname in self.kernel.insn_inames(insn_id)
+                   if isinstance(self.kernel.iname_to_tag.get(iname),
+                                 HardwareParallelTag))
+
+    @memoize_method
+    def common_hw_inames(self, insn_ids):
+        """
+        Return the common set of hardware parallel tagged inames among
+        the list of instructions.
+        """
+        # Intersect the hardware inames across all of the given instructions.
+        if len(insn_ids) == 0:
+            return set()
+        return set.intersection(*(self.hw_inames(id) for id in insn_ids))
+
+# }}}
+
+
+# {{{ add extra args to schedule
+
+def add_extra_args_to_schedule(kernel):
+    """
+    Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel`
+    instructions in the schedule with global temporaries.
+    """
+    new_schedule = []
+
+    insn_query = InstructionQuery(kernel)
+
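+    # Global temporaries without initializers must be passed explicitly to
+    # every subkernel that accesses them; skip any already in extra_args.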
+    for sched_item in kernel.schedule:
+        if isinstance(sched_item, CallKernel):
+            subrange_temporaries = (insn_query
+                .temporaries_read_or_written_in_subkernel(sched_item.kernel_name))
+            more_args = set(tv
+                for tv in subrange_temporaries
+                if
+                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
+                and
+                kernel.temporary_variables[tv].initializer is None
+                and
+                tv not in sched_item.extra_args)
+            new_schedule.append(sched_item.copy(
+                extra_args=sched_item.extra_args + sorted(more_args)))
+        else:
+            new_schedule.append(sched_item)
+
+    return kernel.copy(schedule=new_schedule)
+
+# }}}
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 47abfe53a4bfe8598cd09425b5baa81f13525c37..2ec5eb0d4d5e32dbd9eb201ab718078a6b36f7d8 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import six
 
 import loopy as lp
+import numpy as np
 import warnings
 from islpy import dim_type
 import islpy as isl
@@ -39,13 +40,13 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
-.. autofunction:: get_op_poly
+.. autoclass:: ToCountMap
+.. autoclass:: Op
+.. autoclass:: MemAccess
 
-.. autofunction:: get_gmem_access_poly
-
-.. autofunction:: sum_mem_access_to_bytes
-
-.. autofunction:: get_synchronization_poly
+.. autofunction:: get_op_map
+.. autofunction:: get_mem_access_map
+.. autofunction:: get_synchronization_map
 
 .. autofunction:: gather_access_footprints
 .. autofunction:: gather_access_footprint_bytes
@@ -55,18 +56,27 @@ __doc__ = """
 
 # {{{ ToCountMap
 
-class ToCountMap:
-    """Maps any type of key to an arithmetic type."""
+class ToCountMap(object):
+    """Maps any type of key to an arithmetic type.
+
+    .. automethod:: filter_by
+    .. automethod:: filter_by_func
+    .. automethod:: group_by
+    .. automethod:: to_bytes
+    .. automethod:: sum
+    .. automethod:: eval_and_sum
+
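+    Example usage (a sketch; ``knl`` is assumed to be a fully typed
+    :class:`loopy.LoopKernel`)::
+
+        op_map = lp.get_op_map(knl)  # a ToCountMap of Op -> PwQPolynomial
+        params = {'n': 512, 'm': 256, 'l': 128}
+        total_ops = op_map.sum().eval_with_dict(params)
+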
+    """
 
     def __init__(self, init_dict=None):
         if init_dict is None:
             init_dict = {}
-        self.dict = init_dict
+        self.count_map = init_dict
 
     def __add__(self, other):
-        result = self.dict.copy()
-        for k, v in six.iteritems(other.dict):
-            result[k] = self.dict.get(k, 0) + v
+        result = self.count_map.copy()
+        for k, v in six.iteritems(other.count_map):
+            result[k] = self.count_map.get(k, 0) + v
         return ToCountMap(result)
 
     def __radd__(self, other):
@@ -80,8 +90,8 @@ class ToCountMap:
     def __mul__(self, other):
         if isinstance(other, isl.PwQPolynomial):
             return ToCountMap(dict(
-                (index, self.dict[index]*other)
-                for index in self.dict.keys()))
+                (index, self.count_map[index]*other)
+                for index in self.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
                                 "ToCountMap by {0} {1}."
@@ -91,12 +101,262 @@ class ToCountMap:
 
     def __getitem__(self, index):
         try:
-            return self.dict[index]
+            return self.count_map[index]
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
+    def __setitem__(self, index, value):
+        self.count_map[index] = value
+
     def __repr__(self):
-        return repr(self.dict)
+        return repr(self.count_map)
+
+    def __len__(self):
+        return len(self.count_map)
+
+    def items(self):
+        return self.count_map.items()
+
+    def keys(self):
+        return self.count_map.keys()
+
+    def pop(self, item):
+        return self.count_map.pop(item)
+
+    def copy(self):
+        return ToCountMap(dict(self.count_map))
+
+    def filter_by(self, **kwargs):
+        """Remove items without specified key fields.
+
+        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
+                             the :class:`ToCountMap`, each given a list of
+                             allowable values for that key field.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` that match the field values
+                 passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        from loopy.types import to_loopy_type
+        if 'dtype' in kwargs:
+            kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
+
+        # for each item in self.count_map
+        for self_key, self_val in self.items():
+            try:
+                # check to see if key attribute values match all filters
+                for arg_field, allowable_vals in kwargs.items():
+                    attr_val = getattr(self_key, arg_field)
+                    # see if the value is in the filter list
+                    if attr_val not in allowable_vals:
+                        break
+                else:  # loop terminated without break or error
+                    result_map[self_key] = self_val
+            except AttributeError:
+                # the field passed is not a field of this key
+                continue
+
+        return result_map
+
+    def filter_by_func(self, func):
+        """Keep items that pass a test.
+
+        :parameter func: A function that takes a map key as a parameter and
+                         returns a :class:`bool`.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` for which func(key) is true.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+
+            def filter_func(key):
+                return key.stride > 1 and key.stride <= 4
+
+            filtered_map = mem_map.filter_by_func(filter_func)
+            tot = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # for each item in self.count_map, call func on the key
+        for self_key, self_val in self.items():
+            if func(self_key):
+                result_map[self_key] = self_val
+
+        return result_map
+
+    def group_by(self, *args):
+        """Group map items together, distinguishing by only the key fields
+           passed in args.
+
+        :parameter \*args: Zero or more :class:`str` fields of map keys.
+
+        :return: A :class:`ToCountMap` containing the same total counts
+                 grouped together by new keys that only contain the fields
+                 specified in the arguments passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = get_mem_access_map(knl)
+            grouped_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+            f32_global_ld = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='load')
+                                       ].eval_with_dict(params)
+            f32_global_st = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='store')
+                                       ].eval_with_dict(params)
+            f32_local_ld = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='load')
+                                      ].eval_with_dict(params)
+            f32_local_st = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='store')
+                                      ].eval_with_dict(params)
+
+            op_map = get_op_map(knl)
+            ops_dtype = op_map.group_by('dtype')
+
+            f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params)
+            f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params)
+            i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # make sure all item keys have same type
+        if self.count_map:
+            key_type = type(list(self.keys())[0])
+            if not all(isinstance(x, key_type) for x in self.keys()):
+                raise ValueError("ToCountMap: group_by() function may only "
+                                 "be used on ToCountMaps with uniform keys")
+        else:
+            return result_map
+
+        # for each item in self.count_map
+        for self_key, self_val in self.items():
+            new_key = key_type()
+
+            # set all specified fields
+            for field in args:
+                setattr(new_key, field, getattr(self_key, field))
+
+            if new_key in result_map.keys():
+                result_map[new_key] += self_val
+            else:
+                result_map[new_key] = self_val
+
+        return result_map
+
+    def to_bytes(self):
+        """Convert counts to bytes using data type in map key.
+
+        :return: A :class:`ToCountMap` mapping each original key to a
+                 :class:`islpy.PwQPolynomial` with counts in bytes rather than
+                 instances.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            bytes_map = get_mem_access_map(knl).to_bytes()
+            params = {'n': 512, 'm': 256, 'l': 128}
+
+            s1_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['load']).eval_and_sum(params)
+            s2_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['load']).eval_and_sum(params)
+            s1_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['store']).eval_and_sum(params)
+            s2_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['store']).eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result = self.copy()
+
+        for key, val in self.items():
+            bytes_processed = int(key.dtype.itemsize) * val
+            result[key] = bytes_processed
+
+        return result
+
+    def sum(self):
+        """Add all counts in ToCountMap.
+
+        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
+
+        """
+        total = isl.PwQPolynomial('{ 0 }')
+        for k, v in self.items():
+            if not isinstance(v, isl.PwQPolynomial):
+                raise ValueError("ToCountMap: sum() encountered type {0} but "
+                                 "may only be used on PwQPolynomials."
+                                 .format(type(v)))
+            total += v
+        return total
+
+    def eval_and_sum(self, params):
+        """Add all counts in :class:`ToCountMap` and evaluate with provided
+        parameter dict.
+
+        :return: An :class:`int` containing the sum of all counts in the
+                 :class:`ToCountMap` evaluated with the parameters provided.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+        return self.sum().eval_with_dict(params)
 
 # }}}
 
@@ -108,13 +368,150 @@ def stringify_stats_mapping(m):
     return result
 
 
+class Op(object):
+    """An arithmetic operation.
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type operated on.
+
+    .. attribute:: name
+
+       A :class:`str` that specifies the kind of arithmetic operation as
+       *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+
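+    Example usage (a sketch; ``knl`` is assumed to be a fully typed
+    :class:`loopy.LoopKernel`)::
+
+        op_map = lp.get_op_map(knl)
+        params = {'n': 512, 'm': 256, 'l': 128}
+        f32add = op_map[Op(dtype=np.float32, name='add')
+                       ].eval_with_dict(params)
+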
+    """
+
+    def __init__(self, dtype=None, name=None):
+        self.name = name
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
+
+    def __eq__(self, other):
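+        # fields left as None act as wildcards: they compare equal to any
+        # value of the corresponding field on the other key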
+        return isinstance(other, Op) and (
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.name is None or other.name is None or
+                 self.name == other.name))
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __str__(self):
+        if self.dtype is None:
+            dtype = 'None'
+        else:
+            dtype = str(self.dtype)
+        if self.name is None:
+            name = 'None'
+        else:
+            name = self.name
+        return "Op("+dtype+", "+name+")"
+
+
+class MemAccess(object):
+    """A memory access.
+
+    .. attribute:: mtype
+
+       A :class:`str` that specifies the memory type accessed as **global**
+       or **local**
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type accessed.
+
+    .. attribute:: stride
+
+       An :class:`int` that specifies the stride of the memory access. A
+       stride of 0 indicates a uniform access (i.e. all threads access the
+       same item).
+
+    .. attribute:: direction
+
+       A :class:`str` that specifies the direction of memory access as
+       **load** or **store**.
+
+    .. attribute:: variable
+
+       A :class:`str` that specifies the variable name of the data
+       accessed.
+
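+    Example usage (a sketch; ``knl`` is assumed to be a fully typed
+    :class:`loopy.LoopKernel`)::
+
+        mem_map = lp.get_mem_access_map(knl)
+        params = {'n': 512, 'm': 256, 'l': 128}
+        f32_loads = mem_map.filter_by(mtype=['global'], dtype=[np.float32],
+                                      direction=['load']
+                                     ).eval_and_sum(params)
+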
+    """
+
+    def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
+                 variable=None):
+        self.mtype = mtype
+        self.stride = stride
+        self.direction = direction
+        self.variable = variable
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
+
+        # TODO: currently giving all lmem accesses stride=None
+        if (mtype == 'local') and (stride is not None):
+            raise NotImplementedError("MemAccess: stride must be None when "
+                                      "mtype is 'local'")
+
+        # TODO: currently giving all lmem accesses variable=None
+        if (mtype == 'local') and (variable is not None):
+            raise NotImplementedError("MemAccess: variable must be None when "
+                                      "mtype is 'local'")
+
+    def __eq__(self, other):
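+        # as in Op.__eq__, fields left as None act as wildcards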
+        return isinstance(other, MemAccess) and (
+                (self.mtype is None or other.mtype is None or
+                 self.mtype == other.mtype) and
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.stride is None or other.stride is None or
+                 self.stride == other.stride) and
+                (self.direction is None or other.direction is None or
+                 self.direction == other.direction) and
+                (self.variable is None or other.variable is None or
+                 self.variable == other.variable))
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __str__(self):
+        if self.mtype is None:
+            mtype = 'None'
+        else:
+            mtype = self.mtype
+        if self.dtype is None:
+            dtype = 'None'
+        else:
+            dtype = str(self.dtype)
+        if self.stride is None:
+            stride = 'None'
+        else:
+            stride = str(self.stride)
+        if self.direction is None:
+            direction = 'None'
+        else:
+            direction = self.direction
+        if self.variable is None:
+            variable = 'None'
+        else:
+            variable = self.variable
+        return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \
+               +variable+")"
+
+
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
-        from loopy.expression import TypeInferenceMapper
+        from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl)
 
     def combine(self, values):
@@ -126,41 +523,33 @@ class ExpressionOpCounter(CombineMapper):
     map_tagged_variable = map_constant
     map_variable = map_constant
 
-    #def map_wildcard(self, expr):
-    #    return 0,0
-
-    #def map_function_symbol(self, expr):
-    #    return 0,0
-
     def map_call(self, expr):
         return ToCountMap(
-                    {(self.type_inf(expr), 'func:'+str(expr.function)): 1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
-    # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
-
-    def map_subscript(self, expr):  # implemented in CombineMapper
+    def map_subscript(self, expr):
         return self.rec(expr.index)
 
-    # def map_lookup(self, expr):  # implemented in CombineMapper
-
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
-                    {(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='add'): len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
         assert expr.children
-        return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
+        return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
-                   ToCountMap({(self.type_inf(expr), 'mul'): -1})
+                   ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1})
 
     def map_quotient(self, expr, *args):
-        return ToCountMap({(self.type_inf(expr), 'div'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -168,54 +557,47 @@ class ExpressionOpCounter(CombineMapper):
     map_remainder = map_quotient
 
     def map_power(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
     def map_bitwise_not(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
-        return ToCountMap(
-                        {(self.type_inf(expr), 'bw'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'):
+                           len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
     map_bitwise_and = map_bitwise_or
 
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_logical_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_logical_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_if_branches_ops", 
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_ifpos_branches_ops",
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_min(self, expr):
-        return ToCountMap(
-                        {(self.type_inf(expr), 'maxmin'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'):
+                           len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
 
@@ -225,11 +607,13 @@ class ExpressionOpCounter(CombineMapper):
                                   "map_common_subexpression not implemented.")
 
     def map_substitution(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered substitution, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "substitution, "
                                   "map_substitution not implemented.")
 
     def map_derivative(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered derivative, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "derivative, "
                                   "map_derivative not implemented.")
 
     def map_slice(self, expr):
@@ -239,13 +623,90 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
+# {{{ LocalSubscriptCounter
+
+class LocalSubscriptCounter(CombineMapper):
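+    """Count subscript accesses to local memory, mapping each access to a
+    :class:`MemAccess` key with ``mtype='local'``. Stride and variable name
+    are currently left unset for local accesses."""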
+
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.type_inference import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    map_tagged_variable = map_constant
+    map_variable = map_constant
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_subscript(self, expr):
+        sub_map = ToCountMap()
+        name = expr.aggregate.name  # name of array
+        if name in self.knl.temporary_variables:
+            array = self.knl.temporary_variables[name]
+            if array.is_local:
+                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
+        return sub_map + self.rec(expr.index)
+
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    def map_if(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_common_subexpression(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "common_subexpression, "
+                                  "map_common_subexpression not implemented.")
+
+    def map_substitution(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "substitution, "
+                                  "map_substitution not implemented.")
+
+    def map_derivative(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "derivative, "
+                                  "map_derivative not implemented.")
+
+    def map_slice(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
+                                  "map_slice not implemented.")
+
+# }}}
+
+
 # {{{ GlobalSubscriptCounter
 
 class GlobalSubscriptCounter(CombineMapper):
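+    """Count subscript accesses to global memory, mapping each access to a
+    :class:`MemAccess` key that records the data type, the variable name,
+    and the access stride with respect to the lowest-numbered local axis."""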
 
     def __init__(self, knl):
         self.knl = knl
-        from loopy.expression import TypeInferenceMapper
+        from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl)
 
     def combine(self, values):
@@ -278,33 +739,52 @@ class GlobalSubscriptCounter(CombineMapper):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
-        local_id0 = None
+
+        # find min tag axis
+        import sys
+        min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
-            # find local id0
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                if tag.axis == 0:
-                    local_id0 = iname
-                    break  # there will be only one local_id0
+                if tag.axis < min_tag_axis:
+                    min_tag_axis = tag.axis
 
         if not local_id_found:
             # count as uniform access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'uniform'): 1}
-                    ) + self.rec(expr.index)
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr), stride=0,
+                                         variable=name): 1}
+                             ) + self.rec(expr.index)
+
+        if min_tag_axis != 0:
+            warn_with_kernel(self.knl, "unknown_gmem_stride",
+                             "GlobalSubscriptCounter: Memory access minimum "
+                             "tag axis %d != 0, stride unknown, using "
+                             "sys.maxsize." % (min_tag_axis))
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr),
+                                         stride=sys.maxsize, variable=name): 1}
+                             ) + self.rec(expr.index)
+
+        # get local_id associated with minimum tag axis
+        min_lid = None
+        for iname in my_inames:
+            tag = self.knl.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                if tag.axis == min_tag_axis:
+                    min_lid = iname
+                    break  # there will be only one min local_id
 
-        if local_id0 is None:
-            # only non-zero local id(s) found, assume non-consecutive access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'nonconsecutive'): 1}
-                    ) + self.rec(expr.index)
+        # found local_id associated with minimum tag axis
 
-        # check coefficient of local_id0 for each axis
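+        # total_stride accumulates, over all index axes, the product of the
+        # axis's array stride and the coefficient of min_lid in that axis's
+        # index expression, i.e. the element distance between work items
+        # adjacent along the lowest-numbered local axis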
+        total_stride = 0
+        # check coefficient of min_lid for each axis
         from loopy.symbolic import CoefficientCollector
+        from loopy.kernel.array import FixedStrideArrayDimTag
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
 
@@ -312,36 +792,22 @@ class GlobalSubscriptCounter(CombineMapper):
             coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx))
-            # check if he contains the lid 0 guy
+            # check whether the index expression contains min_lid
             try:
-                coeff_id0 = coeffs[Variable(local_id0)]
+                coeff_min_lid = coeffs[Variable(min_lid)]
             except KeyError:
-                # does not contain local_id0
+                # does not contain min_lid
                 continue
-
-            if coeff_id0 != 1:
-                # non-consecutive access
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # coefficient is 1, now determine if stride is 1
-            from loopy.kernel.array import FixedStrideArrayDimTag
+            # found coefficient of min_lid
+            # now determine stride
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
                 continue
 
-            if stride != 1:
-                # non-consecutive
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
+            total_stride += stride*coeff_min_lid
 
-            # else, stride == 1, continue since another idx could contain id0
-
-        # loop finished without returning, stride==1 for every instance of local_id0
-        return ToCountMap(
-                {(self.type_inf(expr), 'consecutive'): 1}
-                ) + self.rec(expr.index)
+        return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr),
+                                     stride=total_stride, variable=name): 1}
+                         ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -351,48 +817,19 @@ class GlobalSubscriptCounter(CombineMapper):
 
     map_product = map_sum
 
-    def map_quotient(self, expr, *args):
-        return self.rec(expr.numerator) + self.rec(expr.denominator)
-
-    map_floor_div = map_quotient
-    map_remainder = map_quotient
-
-    def map_power(self, expr):
-        return self.rec(expr.base) + self.rec(expr.exponent)
-
-    def map_left_shift(self, expr):
-        return self.rec(expr.shiftee)+self.rec(expr.shift)
-
-    map_right_shift = map_left_shift
-
-    def map_bitwise_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_bitwise_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_bitwise_xor = map_bitwise_or
-    map_bitwise_and = map_bitwise_or
-
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    map_logical_not = map_bitwise_not
-    map_logical_or = map_bitwise_or
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_if_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
-
-    map_min = map_bitwise_or
-    map_max = map_min
+        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("GlobalSubscriptCounter encountered "
@@ -524,7 +961,8 @@ def count(kernel, set):
 
             # {{{ rebuild check domain
 
-            zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space))
+            zero = isl.Aff.zero_on_domain(
+                        isl.LocalSpace.from_space(bset.space))
             iname = isl.PwAff.from_aff(
                     zero.set_coefficient_val(isl.dim_type.in_, i, 1))
             dmin_matched = dmin.insert_dims(
@@ -584,31 +1022,44 @@ def get_op_poly(knl, numpy_types=True):
 
     """Count the number of operations in a loopy kernel.
 
+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "depricated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
+
+def get_op_map(knl, numpy_types=True):
+
+    """Count the number of operations in a loopy kernel.
+
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **)**
-             **:** :class:`islpy.PwQPolynomial` **}**.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
+    :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
+             :class:`islpy.PwQPolynomial` **}**.
 
-             - The string specifies the operation type as
-               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+             - The :class:`Op` specifies the characteristics of the arithmetic
+               operation.
 
              - The :class:`islpy.PwQPolynomial` holds the number of operations of
                the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *parameter inames*).
+               :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        poly = get_op_poly(knl)
+        op_map = get_op_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params)
+        f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -618,88 +1069,128 @@ def get_op_poly(knl, numpy_types=True):
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    op_poly = ToCountMap()
+    op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
         # check domain size:
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(
+                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_poly = op_poly + ops*count(knl, domain)
-    result = op_poly.dict
+        op_map = op_map + ops*count(knl, domain)
 
     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind), count)
-                for (dtype, kind), count in six.iteritems(result))
+        op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+                                 count)
+                for op, count in six.iteritems(op_map.count_map))
 
-    return result
-# }}}
+    return op_map
 
 
-def sum_ops_to_dtypes(op_poly_dict):
-    result = {}
-    for (dtype, kind), v in op_poly_dict.items():
-        new_key = dtype
-        if new_key in result:
-            result[new_key] += v
-        else:
-            result[new_key] = v
+# TODO: test deprecated functions?
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.
 
-    return result
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 
 # {{{ get_gmem_access_poly
-def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
 
+def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
-                    counted.
+    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **,**
-             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+    """
+    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+# }}}
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
 
-             - The first string in the map key specifies the global memory
-               access type as
-               *consecutive*, *nonconsecutive*, or *uniform*.
+def get_mem_access_map(knl, numpy_types=True):
+    """Count the number of memory accesses in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+                    counted.
 
-             - The second string in the map key specifies the global memory
-               access type as a
-               *load*, or a *store*.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
-               with the characteristics specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
+    :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
+             :class:`islpy.PwQPolynomial` **}**.
+
+             - The :class:`MemAccess` specifies the characteristics of the
+               memory access.
+
+             - The :class:`islpy.PwQPolynomial` holds the number of memory
+               accesses with the characteristics specified in the key (in terms
+               of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        subscript_map = get_gmem_access_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-
-        f32_uncoalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'nonconsecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_store = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'store')
-                            ].eval_with_dict(params)
+        mem_map = get_mem_access_map(knl)
+
+        f32_s1_g_ld_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_g_st_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_l_ld_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='x')
+                               ].eval_with_dict(params)
+        f32_s1_l_st_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='x')
+                               ].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
     """
-
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
     class CacheHolder(object):
@@ -712,7 +1203,8 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
         if uniform:
             from loopy.kernel.data import LocalIndexTag
             insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+                           isinstance(knl.iname_to_tag.get(iname),
+                                      LocalIndexTag)]
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(
                                 insn_inames, [dim_type.set]))
@@ -721,82 +1213,82 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_poly = ToCountMap()
-    subscript_counter = GlobalSubscriptCounter(knl)
+    subs_map = ToCountMap()
+    subs_counter_g = GlobalSubscriptCounter(knl)
+    subs_counter_l = LocalSubscriptCounter(knl)
+
     for insn in knl.instructions:
-        # count subscripts, distinguishing loads and stores
-        subs_expr = subscript_counter(insn.expression)
-        subs_expr = ToCountMap(dict(
-            (key + ("load",), val)
-            for key, val in six.iteritems(subs_expr.dict)))
-        subs_assignee = subscript_counter(insn.assignee)
-        subs_assignee = ToCountMap(dict(
-            (key + ("store",), val)
-            for key, val in six.iteritems(subs_assignee.dict)))
+        # count subscripts
+        subs_expr = subs_counter_g(insn.expression) \
+                    + subs_counter_l(insn.expression)
+
+        # distinguish loads and stores
+        # iterate over a snapshot of the keys: the loop body mutates the map
+        for key in list(subs_expr.count_map):
+            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                stride=key.stride, direction='load',
+                                variable=key.variable)
+                     ] = subs_expr.pop(key)
+
+        subs_assignee_g = subs_counter_g(insn.assignee)
+        for key in list(subs_assignee_g.count_map):
+            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                      stride=key.stride,
+                                      direction='store',
+                                      variable=key.variable)
+                           ] = subs_assignee_g.pop(key)
+        # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.dict:
-            poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+        for key in subs_expr.count_map:
+            amap = ToCountMap({key: subs_expr[key]})
+            if (key.mtype == 'global' and isinstance(key.stride, int)
+                    and key.stride == 0):
+                subs_map = subs_map \
+                            + amap*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-        for key in subs_assignee.dict:
-            poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+                subs_map = subs_map + amap*get_insn_count(knl, insn_inames)
+                # currently not counting stride of local mem accesses
+
+        for key in subs_assignee_g.count_map:
+            amap = ToCountMap({key: subs_assignee_g[key]})
+            if isinstance(key.stride, int) and key.stride == 0:
+                subs_map = subs_map \
+                            + amap*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-
-    result = subs_poly.dict
+                subs_map = subs_map + amap*get_insn_count(knl, insn_inames)
+            # for now, don't count writes to local mem
 
     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind, direction), count)
-                for (dtype, kind, direction), count in six.iteritems(result))
-
-    return result
+        subs_map.count_map = dict(
+                (MemAccess(mtype=mem_access.mtype,
+                           dtype=mem_access.dtype.numpy_dtype,
+                           stride=mem_access.stride,
+                           direction=mem_access.direction,
+                           variable=mem_access.variable),
+                 count)
+                for mem_access, count in six.iteritems(subs_map.count_map))
 
-
-def get_DRAM_access_poly(knl):
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
-            DeprecationWarning, stacklevel=2)
-    return get_gmem_access_poly(knl)
-
-# }}}
+    return subs_map
 
 
-# {{{ sum_mem_access_to_bytes
+# {{{ get_synchronization_poly
 
-def sum_mem_access_to_bytes(m):
-    """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
 
-    **{(** :class:`string` **,** :class:`string` **)**
-    **:** :class:`islpy.PwQPolynomial` **}**
+    get_synchronization_poly is deprecated. Use get_synchronization_map
+    instead.
 
-    i.e., aggregate the transfer numbers for all types into a single byte count.
     """
-
-    result = {}
-    for (dtype, kind, direction), v in m.items():
-        new_key = (kind, direction)
-        bytes_transferred = int(dtype.itemsize) * v
-        if new_key in result:
-            result[new_key] += bytes_transferred
-        else:
-            result[new_key] = bytes_transferred
-
-    return result
+    warn_with_kernel(knl, "depricated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
 
 # }}}
 
 
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
+def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
@@ -804,8 +1296,8 @@ def get_synchronization_poly(knl):
     :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
-    :return: A dictionary mapping each type of synchronization event to a
-            :class:`islpy.PwQPolynomial` holding the number of such events
-            per thread.
+    :return: A :class:`ToCountMap` mapping each type of synchronization
+            event to an :class:`islpy.PwQPolynomial` holding the number of
+            events per thread.
 
             Possible keys include ``barrier_local``, ``barrier_global``
             (if supported by the target) and ``kernel_launch``.
@@ -814,9 +1306,9 @@ def get_synchronization_poly(knl):
 
         # (first create loopy kernel and specify array data types)
 
-        barrier_poly = get_barrier_poly(knl)
+        sync_map = get_synchronization_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = barrier_poly.eval_with_dict(params)
+        barrier_ct = sync_map['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
@@ -854,8 +1346,8 @@ def get_synchronization_poly(knl):
                 iname_list.pop()
 
         elif isinstance(sched_item, Barrier):
-            result = result + ToCountMap(
-                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+            result = result + ToCountMap({"barrier_%s" % sched_item.kind:
+                                          get_count_poly(iname_list)})
 
         elif isinstance(sched_item, CallKernel):
             result = result + ToCountMap(
@@ -868,9 +1360,8 @@ def get_synchronization_poly(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    return result.dict
-
-# }}}
+    return result
 
 
 # {{{ gather_access_footprints
@@ -881,7 +1372,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
     of each the array *var_name* are read/written (where
     *direction* is either ``read`` or ``write``.
 
-    :arg ignore_uncountable: If *True*, an error will be raised for
+    :arg ignore_uncountable: If *False*, an error will be raised for
         accesses on which the footprint cannot be determined (e.g.
         data-dependent or nonlinear indices)
     """
@@ -905,7 +1396,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
         insn_inames = kernel.insn_inames(insn)
         inames_domain = kernel.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(insn_inames,
+                                                   [dim_type.set]))
 
         afg = AccessFootprintGatherer(kernel, domain,
                 ignore_uncountable=ignore_uncountable)
@@ -947,7 +1439,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         kernel = preprocess_kernel(kernel)
 
     result = {}
-    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
+    fp = gather_access_footprints(kernel,
+                                  ignore_uncountable=ignore_uncountable)
 
     for key, var_fp in fp.items():
         vname, direction = key
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 5b5b2477651c4026cfb4b0618481fbb8b3710728..430c651589939a1001432bd8db413cb5902b14a6 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -28,11 +28,10 @@ THE SOFTWARE.
 import six
 from six.moves import range, zip, reduce, intern
 
-from pytools import memoize, memoize_method, Record
+from pytools import memoize, memoize_method, ImmutableRecord
 import pytools.lex
 
-from pymbolic.primitives import (
-        Leaf, Expression, Variable, CommonSubexpression)
+import pymbolic.primitives as p
 
 from pymbolic.mapper import (
         CombineMapper as CombineMapperBase,
@@ -83,11 +82,11 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_reduction(self, expr, *args):
-        mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames]
+        mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames]
 
         new_inames = []
         for iname, new_sym_iname in zip(expr.inames, mapped_inames):
-            if not isinstance(new_sym_iname, Variable):
+            if not isinstance(new_sym_iname, p.Variable):
                 from loopy.diagnostic import LoopyError
                 raise LoopyError("%s did not map iname '%s' to a variable"
                         % (type(self).__name__, iname))
@@ -253,7 +252,7 @@ class DependencyMapper(DependencyMapperBase):
 
     def map_reduction(self, expr):
         return (self.rec(expr.expr)
-                - set(Variable(iname) for iname in expr.inames))
+                - set(p.Variable(iname) for iname in expr.inames))
 
     def map_tagged_variable(self, expr):
         return set([expr])
@@ -303,7 +302,7 @@ class SubstitutionRuleExpander(IdentityMapper):
 
 # {{{ loopy-specific primitives
 
-class Literal(Leaf):
+class Literal(p.Leaf):
     """A literal to be used during code generation."""
 
     def __init__(self, s):
@@ -320,7 +319,7 @@ class Literal(Leaf):
     mapper_method = "map_literal"
 
 
-class ArrayLiteral(Leaf):
+class ArrayLiteral(p.Leaf):
     "An array literal."
 
     # Currently only used after loopy -> C expression translation.
@@ -339,7 +338,7 @@ class ArrayLiteral(Leaf):
     mapper_method = "map_array_literal"
 
 
-class HardwareAxisIndex(Leaf):
+class HardwareAxisIndex(p.Leaf):
     def __init__(self, axis):
         self.axis = axis
 
@@ -360,7 +359,7 @@ class LocalHardwareAxisIndex(HardwareAxisIndex):
     mapper_method = "map_local_hw_index"
 
 
-class FunctionIdentifier(Leaf):
+class FunctionIdentifier(p.Leaf):
     """A base class for symbols representing functions."""
 
     init_arg_names = ()
@@ -371,13 +370,13 @@ class FunctionIdentifier(Leaf):
     mapper_method = intern("map_loopy_function_identifier")
 
 
-class TypedCSE(CommonSubexpression):
+class TypedCSE(p.CommonSubexpression):
     """A :class:`pymbolic.primitives.CommonSubexpression` annotated with
     a :class:`numpy.dtype`.
     """
 
     def __init__(self, child, prefix=None, dtype=None):
-        CommonSubexpression.__init__(self, child, prefix)
+        super(TypedCSE, self).__init__(child, prefix)
         self.dtype = dtype
 
     def __getinitargs__(self):
@@ -387,7 +386,7 @@ class TypedCSE(CommonSubexpression):
         return dict(dtype=self.dtype)
 
 
-class TypeAnnotation(Expression):
+class TypeAnnotation(p.Expression):
     def __init__(self, type, child):
         super(TypeAnnotation, self).__init__()
         self.type = type
@@ -399,7 +398,7 @@ class TypeAnnotation(Expression):
     mapper_method = intern("map_type_annotation")
 
 
-class TaggedVariable(Variable):
+class TaggedVariable(p.Variable):
     """This is an identifier with a tag, such as 'matrix$one', where
     'one' identifies this specific use of the identifier. This mechanism
     may then be used to address these uses--such as by prefetching only
@@ -409,7 +408,7 @@ class TaggedVariable(Variable):
     init_arg_names = ("name", "tag")
 
     def __init__(self, name, tag):
-        Variable.__init__(self, name)
+        super(TaggedVariable, self).__init__(name)
         self.tag = tag
 
     def __getinitargs__(self):
@@ -421,7 +420,7 @@ class TaggedVariable(Variable):
     mapper_method = intern("map_tagged_variable")
 
 
-class Reduction(Expression):
+class Reduction(p.Expression):
     """Represents a reduction operation on :attr:`expr`
     across :attr:`inames`.
 
@@ -451,13 +450,13 @@ class Reduction(Expression):
         if isinstance(inames, str):
             inames = tuple(iname.strip() for iname in inames.split(","))
 
-        elif isinstance(inames, Variable):
+        elif isinstance(inames, p.Variable):
             inames = (inames,)
 
         assert isinstance(inames, tuple)
 
         def strip_var(iname):
-            if isinstance(iname, Variable):
+            if isinstance(iname, p.Variable):
                 iname = iname.name
 
             assert isinstance(iname, str)
@@ -501,7 +500,7 @@ class Reduction(Expression):
     mapper_method = intern("map_reduction")
 
 
-class LinearSubscript(Expression):
+class LinearSubscript(p.Expression):
     """Represents a linear index into a multi-dimensional array, completely
     ignoring any multi-dimensional layout.
     """
@@ -521,7 +520,7 @@ class LinearSubscript(Expression):
     mapper_method = intern("map_linear_subscript")
 
 
-class RuleArgument(Expression):
+class RuleArgument(p.Expression):
     """Represents a (numbered) argument of a :class:`loopy.SubstitutionRule`.
     Only used internally in the rule-aware mappers to match subst rules
     independently of argument names.
@@ -554,13 +553,13 @@ def get_dependencies(expr):
 def parse_tagged_name(expr):
     if isinstance(expr, TaggedVariable):
         return expr.name, expr.tag
-    elif isinstance(expr, Variable):
+    elif isinstance(expr, p.Variable):
         return expr.name, None
     else:
         raise RuntimeError("subst rule name not understood: %s" % expr)
 
 
-class ExpansionState(Record):
+class ExpansionState(ImmutableRecord):
     """
     .. attribute:: kernel
     .. attribute:: instruction
@@ -590,7 +589,7 @@ class SubstitutionRuleRenamer(IdentityMapper):
         self.renames = renames
 
     def map_call(self, expr):
-        if not isinstance(expr.function, Variable):
+        if not isinstance(expr.function, p.Variable):
             return IdentityMapper.map_call(self, expr)
 
         name, tag = parse_tagged_name(expr.function)
@@ -600,7 +599,7 @@ class SubstitutionRuleRenamer(IdentityMapper):
             return IdentityMapper.map_call(self, expr)
 
         if tag is None:
-            sym = Variable(new_name)
+            sym = p.Variable(new_name)
         else:
             sym = TaggedVariable(new_name, tag)
 
@@ -614,7 +613,7 @@ class SubstitutionRuleRenamer(IdentityMapper):
             return IdentityMapper.map_variable(self, expr)
 
         if tag is None:
-            return Variable(new_name)
+            return p.Variable(new_name)
         else:
             return TaggedVariable(new_name, tag)
 
@@ -760,7 +759,7 @@ class RuleAwareIdentityMapper(IdentityMapper):
             return self.map_substitution(name, tag, (), expn_state)
 
     def map_call(self, expr, expn_state):
-        if not isinstance(expr.function, Variable):
+        if not isinstance(expr.function, p.Variable):
             return IdentityMapper.map_call(self, expr, expn_state)
 
         name, tag = parse_tagged_name(expr.function)
@@ -803,7 +802,7 @@ class RuleAwareIdentityMapper(IdentityMapper):
                 name, rule.arguments, result)
 
         if tag is None:
-            sym = Variable(new_name)
+            sym = p.Variable(new_name)
         else:
             sym = TaggedVariable(new_name, tag)
 
@@ -920,7 +919,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
     def _parse_reduction(self, operation, inames, red_expr,
             allow_simultaneous=False):
-        if isinstance(inames, Variable):
+        if isinstance(inames, p.Variable):
             inames = (inames,)
 
         if not isinstance(inames, tuple):
@@ -929,7 +928,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
         processed_inames = []
         for iname in inames:
-            if not isinstance(iname, Variable):
+            if not isinstance(iname, p.Variable):
                 raise TypeError("iname argument to reduce() must be a symbol "
                         "or a tuple or a tuple of symbols")
 
@@ -941,22 +940,20 @@ class FunctionToPrimitiveMapper(IdentityMapper):
     def map_call(self, expr):
         from loopy.library.reduction import parse_reduction_op
 
-        from pymbolic.primitives import Variable
-        if not isinstance(expr.function, Variable):
+        if not isinstance(expr.function, p.Variable):
             return IdentityMapper.map_call(self, expr)
 
         name = expr.function.name
         if name == "cse":
-            from pymbolic.primitives import CommonSubexpression
             if len(expr.parameters) in [1, 2]:
                 if len(expr.parameters) == 2:
-                    if not isinstance(expr.parameters[1], Variable):
+                    if not isinstance(expr.parameters[1], p.Variable):
                         raise TypeError("second argument to cse() must be a symbol")
                     tag = expr.parameters[1].name
                 else:
                     tag = None
 
-                return CommonSubexpression(
+                return p.CommonSubexpression(
                         self.rec(expr.parameters[0]), tag)
             else:
                 raise TypeError("cse takes two arguments")
@@ -965,7 +962,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
             if len(expr.parameters) == 3:
                 operation, inames, red_expr = expr.parameters
 
-                if not isinstance(operation, Variable):
+                if not isinstance(operation, p.Variable):
                     raise TypeError("operation argument to reduce() "
                             "must be a symbol")
 
@@ -1098,8 +1095,7 @@ class ArrayAccessFinder(CombineMapper):
         return set()
 
     def map_subscript(self, expr):
-        from pymbolic.primitives import Variable
-        assert isinstance(expr.aggregate, Variable)
+        assert isinstance(expr.aggregate, p.Variable)
 
         if self.tgt_vector_name is None \
                 or expr.aggregate.name == self.tgt_vector_name:
@@ -1142,12 +1138,17 @@ def pw_aff_to_expr(pw_aff, int_ok=False):
         return pw_aff
 
     pieces = pw_aff.get_pieces()
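+    # A PwAff consists of (validity set, affine function) pieces. Use the
+    # last piece as the fallback and nest the remaining pieces around it,
+    # testing each piece's condition in turn.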
+    last_expr = aff_to_expr(pieces[-1][1])
 
-    if len(pieces) != 1:
-        raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances")
+    pairs = [(set_to_cond_expr(constr_set), aff_to_expr(aff))
+             for constr_set, aff in pieces[:-1]]
 
-    (set, aff), = pieces
-    return aff_to_expr(aff)
+    from pymbolic.primitives import If
+    expr = last_expr
+    for condition, then_expr in reversed(pairs):
+        expr = If(condition, then_expr, expr)
+
+    return expr
 
 # }}}
 
@@ -1255,7 +1256,7 @@ def simplify_using_aff(kernel, expr):
 # }}}
 
 
-# {{{ expression <-> constraint conversion
+# {{{ expression/set <-> constraint conversion
 
 def eq_constraint_from_expr(space, expr):
     return isl.Constraint.equality_from_aff(aff_from_expr(space, expr))
@@ -1265,7 +1266,7 @@ def ineq_constraint_from_expr(space, expr):
     return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr))
 
 
-def constraint_to_expr(cns):
+def constraint_to_cond_expr(cns):
     # Looks like this is ok after all--get_aff() performs some magic.
     # Not entirely sure though... FIXME
     #
@@ -1284,6 +1285,39 @@ def constraint_to_expr(cns):
 # }}}
 
 
+# {{{ set_to_cond_expr
+
+def basic_set_to_cond_expr(isl_basicset):
+    constrs = []
+    for constr in isl_basicset.get_constraints():
+        constrs.append(constraint_to_cond_expr(constr))
+
+    if len(constrs) == 0:
+        raise ValueError("may not be called on universe")
+    elif len(constrs) == 1:
+        constr, = constrs
+        return constr
+    else:
+        return p.LogicalAnd(tuple(constrs))
+
+
+def set_to_cond_expr(isl_set):
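+    """Return a :mod:`pymbolic` expression that tests membership in
+    *isl_set*: a logical OR over the set's basic sets, each rendered as a
+    logical AND over its constraints. (Roughly, ``{ [i] : 0 <= i < n }``
+    becomes a conjunction of two inequality comparisons.)
+    """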
+    conjs = []
+    for isl_basicset in isl_set.get_basic_sets():
+        conjs.append(basic_set_to_cond_expr(isl_basicset))
+
+    if len(conjs) == 0:
+        raise ValueError("may not be called on universe")
+    elif len(conjs) == 1:
+        conj, = conjs
+        return conj
+    else:
+        return p.LogicalOr(tuple(conjs))
+
+
+# }}}
+
+
 # {{{ Reduction callback mapper
 
 class ReductionCallbackMapper(IdentityMapper):
@@ -1318,10 +1352,9 @@ class IndexVariableFinder(CombineMapper):
     def map_subscript(self, expr):
         idx_vars = DependencyMapper()(expr.index)
 
-        from pymbolic.primitives import Variable
         result = set()
         for idx_var in idx_vars:
-            if isinstance(idx_var, Variable):
+            if isinstance(idx_var, p.Variable):
                 result.add(idx_var.name)
             else:
                 raise RuntimeError("index variable not understood: %s" % idx_var)
@@ -1432,8 +1465,7 @@ class AccessRangeMapper(WalkMapper):
         domain = self.kernel.get_inames_domain(inames)
         WalkMapper.map_subscript(self, expr, inames)
 
-        from pymbolic.primitives import Variable
-        assert isinstance(expr.aggregate, Variable)
+        assert isinstance(expr.aggregate, p.Variable)
 
         if expr.aggregate.name != self.arg_name:
             return
@@ -1476,8 +1508,7 @@ def is_expression_equal(a, b):
     if a == b:
         return True
 
-    from pymbolic.primitives import Expression
-    if isinstance(a, Expression) or isinstance(b, Expression):
+    if isinstance(a, p.Expression) or isinstance(b, p.Expression):
         if a is None or b is None:
             return False
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 409b9badb639c500e70404e781036b2e39bf333f..5d5743bae322fc59c989cafd85122c8ca619c422 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -216,6 +216,9 @@ class ASTBuilderBase(object):
     def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
         raise NotImplementedError()
 
+    def emit_declaration_scope(self, codegen_state, inner):
+        raise NotImplementedError()
+
     def emit_blank_line(self):
         raise NotImplementedError()
 
@@ -267,6 +270,10 @@ class DummyHostASTBuilder(ASTBuilderBase):
     def ast_block_class(self):
         return _DummyASTBlock
 
+    @property
+    def ast_block_scope_class(self):
+        return _DummyASTBlock
+
     def emit_assignment(self, codegen_state, insn):
         return None
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 50ae5856bce6eab8cf874242558a6c34f63a2d79..be83ec90c4720f10876e1a5e47a43c429fc40aeb 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -1,4 +1,4 @@
-"""OpenCL target independent of PyOpenCL."""
+"""Plain C target and base for other C-family languages."""
 
 from __future__ import division, absolute_import
 
@@ -29,10 +29,11 @@ import six
 import numpy as np  # noqa
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError
-from cgen import Pointer
+from cgen import Pointer, NestedDeclarator, Block
 from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
 from loopy.symbolic import IdentityMapper
+import pymbolic.primitives as p
 
 from pytools import memoize_method
 
@@ -131,6 +132,16 @@ class POD(Declarator):
 
     mapper_method = "map_loopy_pod"
 
+
+class ScopingBlock(Block):
+    """A block that is mandatory for scoping and may not be simplified away
+    by :func:`loopy.codegen.result.merge_codegen_results`.
+    """
+
+
+class FunctionDeclarationWrapper(NestedDeclarator):
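+    """Wraps a function declaration so that it can be found (and unwrapped)
+    later, e.g. by target subclasses that add qualifiers around the inner
+    declaration, or by the header extractor below."""
+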
+    mapper_method = "map_function_decl_wrapper"
+
 # }}}
 
 
@@ -202,6 +213,10 @@ class CASTIdentityMapper(CASTIdentityMapperBase):
     def map_loopy_pod(self, node, *args, **kwargs):
         return type(node)(node.ast_builder, node.dtype, node.name)
 
+    def map_function_decl_wrapper(self, node, *args, **kwargs):
+        return FunctionDeclarationWrapper(
+                self.rec(node.subdecl, *args, **kwargs))
+
 
 class SubscriptSubsetCounter(IdentityMapper):
     def __init__(self, subset_counters):
@@ -333,7 +348,7 @@ class CASTBuilder(ASTBuilderBase):
                                 index_dtype=kernel.index_dtype)
                 decl = self.wrap_global_constant(
                         self.get_temporary_decl(
-                            kernel, schedule_index, tv,
+                            codegen_state, schedule_index, tv,
                             decl_info))
 
                 if tv.initializer is not None:
@@ -377,10 +392,11 @@ class CASTBuilder(ASTBuilderBase):
         if self.target.fortran_abi:
             name += "_"
 
-        return FunctionDeclaration(
-                        Value("void", name),
-                        [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
-                            for idi in codegen_state.implemented_data_info])
+        return FunctionDeclarationWrapper(
+                FunctionDeclaration(
+                    Value("void", name),
+                    [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
+                        for idi in codegen_state.implemented_data_info]))
 
     def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import temp_var_scope
@@ -409,7 +425,8 @@ class CASTBuilder(ASTBuilderBase):
                     if tv.scope != temp_var_scope.GLOBAL:
                         decl = self.wrap_temporary_decl(
                                 self.get_temporary_decl(
-                                    kernel, schedule_index, tv, idi), tv.scope)
+                                    codegen_state, schedule_index, tv, idi),
+                                tv.scope)
 
                         if tv.initializer is not None:
                             decl = Initializer(decl, generate_array_literal(
@@ -467,12 +484,21 @@ class CASTBuilder(ASTBuilderBase):
                             idi.dtype.itemsize
                             * product(si for si in idi.shape))
 
+        ecm = self.get_expression_to_code_mapper(codegen_state)
+
         for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
             bs_var_decl = Value("char", bs_name)
             from pytools import single_valued
             bs_var_decl = self.wrap_temporary_decl(
                     bs_var_decl, single_valued(base_storage_to_scope[bs_name]))
-            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))
+
+            # FIXME: Could try to use isl knowledge to simplify max.
+            if all(isinstance(bs, int) for bs in bs_sizes):
+                bs_size_max = max(bs_sizes)
+            else:
+                bs_size_max = p.Max(tuple(bs_sizes))
+
+            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))
 
             alignment = max(base_storage_to_align_bytes[bs_name])
             bs_var_decl = AlignedAttribute(alignment, bs_var_decl)
@@ -493,6 +519,10 @@ class CASTBuilder(ASTBuilderBase):
         from cgen import Block
         return Block
 
+    @property
+    def ast_block_scope_class(self):
+        return ScopingBlock
+
     # }}}
 
     # {{{ code generation guts
@@ -509,7 +539,7 @@ class CASTBuilder(ASTBuilderBase):
         from loopy.target.c.codegen.expression import CExpressionToCodeMapper
         return CExpressionToCodeMapper()
 
-    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
+    def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
         if temp_var.read_only:
@@ -518,8 +548,10 @@ class CASTBuilder(ASTBuilderBase):
 
         if decl_info.shape:
             from cgen import ArrayOf
+            ecm = self.get_expression_to_code_mapper(codegen_state)
             temp_var_decl = ArrayOf(temp_var_decl,
-                    " * ".join(str(s) for s in decl_info.shape))
+                    ecm(p.flattened_product(decl_info.shape),
+                        prec=PREC_NONE, type_context="i"))
 
         return temp_var_decl
 
@@ -690,11 +722,9 @@ class CASTBuilder(ASTBuilderBase):
                 CExpression(self.get_c_expression_to_code_mapper(), result))
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
+            lbound, ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
 
-        from loopy.symbolic import aff_to_expr
-
         from pymbolic import var
         from pymbolic.primitives import Comparison
         from pymbolic.mapper.stringifier import PREC_NONE
@@ -703,12 +733,12 @@ class CASTBuilder(ASTBuilderBase):
         return For(
                 InlineInitializer(
                     POD(self, iname_dtype, iname),
-                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                    ecm(lbound, PREC_NONE, "i")),
                 ecm(
                     Comparison(
                         var(iname),
                         "<=",
-                        aff_to_expr(static_ubound)),
+                        ubound),
                     PREC_NONE, "i"),
                 "++%s" % iname,
                 inner)
@@ -743,4 +773,45 @@ class CASTBuilder(ASTBuilderBase):
         return node
 
 
+# {{{ header generation
+
+class CFunctionDeclExtractor(CASTIdentityMapper):
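+    """Walks a C AST and collects the subdeclarations of all
+    :class:`FunctionDeclarationWrapper` nodes into :attr:`decls`."""
+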
+    def __init__(self):
+        self.decls = []
+
+    def map_expression(self, expr):
+        return expr
+
+    def map_function_decl_wrapper(self, node):
+        self.decls.append(node.subdecl)
+        return super(CFunctionDeclExtractor, self)\
+                .map_function_decl_wrapper(node)
+
+
+def generate_header(kernel, codegen_result=None):
+    """
+    :arg kernel: a :class:`loopy.LoopKernel`
+    :arg codegen_result: an instance of :class:`loopy.CodeGenerationResult`
+    :returns: a list of AST nodes (which may have :func:`str`
+        called on them to produce a string) representing
+        function declarations for the generated device
+        functions.
+    """
+
+    if not isinstance(kernel.target, CTarget):
+        raise LoopyError(
+                "header generation is not implemented for non-C-based targets")
+
+    if codegen_result is None:
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+    fde = CFunctionDeclExtractor()
+    for dev_prg in codegen_result.device_programs:
+        fde(dev_prg.ast)
+
+    return fde.decls
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 91c42c542f67412b749ce739c3fda56b3ead4d7f..68cc32e56be077c7e45d11b9e2aade86b04494cc 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -36,7 +36,8 @@ import pymbolic.primitives as p
 from pymbolic import var
 
 
-from loopy.expression import dtype_to_type_context, TypeInferenceMapper
+from loopy.expression import dtype_to_type_context
+from loopy.type_inference import TypeInferenceMapper
 
 from loopy.diagnostic import LoopyError, LoopyWarning
 from loopy.tools import is_integer
@@ -104,7 +105,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 self.infer_type(expr), needed_dtype,
                 RecursiveMapper.rec(self, expr, type_context))
 
-    def __call__(self, expr, prec, type_context=None, needed_dtype=None):
+    def __call__(self, expr, prec=None, type_context=None, needed_dtype=None):
+        if prec is None:
+            prec = PREC_NONE
+
         assert prec == PREC_NONE
         from loopy.target.c import CExpression
         return CExpression(
@@ -144,6 +148,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             from loopy.kernel.data import ValueArg
             if isinstance(arg, ValueArg) and self.fortran_abi:
                 postproc = lambda x: x[0]  # noqa
+        elif expr.name in self.kernel.temporary_variables:
+            temporary = self.kernel.temporary_variables[expr.name]
+            if temporary.base_storage:
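+                # Temporaries aliased onto base_storage are realized as
+                # pointers, so a scalar reference needs a dereference.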
+                postproc = lambda x: x[0]  # noqa
 
         result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name)
         if result is not None:
@@ -212,12 +220,15 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)):
             if len(access_info.subscripts) == 0:
-                if isinstance(ary, GlobalArg) or isinstance(ary, ConstantArg):
+                if (
+                        (isinstance(ary, (ConstantArg, GlobalArg)) or
+                         (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                     # unsubscripted global args are pointers
                     result = var(access_info.array_name)[0]
 
                 else:
                     # unsubscripted temp vars are scalars
+                    # (unless they use base_storage)
                     result = var(access_info.array_name)
 
             else:
@@ -675,6 +686,10 @@ class CExpressionToCodeMapper(RecursiveMapper):
         return f % tuple(
                 self.rec(i, prec) for i in iterable)
 
+    def join(self, joiner, iterable):
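+        # Like join_rec above, but for pieces that are already strings.
+        # Note that *iterable* must be a sequence: it is traversed twice.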
+        f = joiner.join("%s" for i in iterable)
+        return f % tuple(iterable)
+
     # }}}
 
     def map_constant(self, expr, prec):
@@ -769,9 +784,19 @@ class CExpressionToCodeMapper(RecursiveMapper):
                 enclosing_prec, PREC_LOGICAL_AND)
 
     def map_logical_or(self, expr, enclosing_prec):
-        return self.parenthesize_if_needed(
-                self.join_rec(" || ", expr.children, PREC_LOGICAL_OR),
-                enclosing_prec, PREC_LOGICAL_OR)
+        mapped_children = []
+        from pymbolic.primitives import LogicalAnd
+        for child in expr.children:
+            mapped_child = self.rec(child, PREC_LOGICAL_OR)
+            # clang warns on unparenthesized && within ||
+            if isinstance(child, LogicalAnd):
+                mapped_child = "(%s)" % mapped_child
+            mapped_children.append(mapped_child)
+
+        result = self.join(" || ", mapped_children)
+        if enclosing_prec > PREC_LOGICAL_OR:
+            result = "(%s)" % result
+        return result
 
     def map_sum(self, expr, enclosing_prec):
         from pymbolic.mapper.stringifier import PREC_SUM
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index d31718f15ed563bba0b602e6017536b72b6deed0..2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -99,6 +99,7 @@ def _create_vector_types():
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
+
 _create_vector_types()
 
 
@@ -147,7 +148,7 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper):
     def _get_index_ctype(kernel):
         if kernel.index_dtype.numpy_dtype == np.int32:
             return "int32_t"
-        elif kernel.index_dtype.numpy_dtype == np.int32:
+        elif kernel.index_dtype.numpy_dtype == np.int64:
             return "int64_t"
         else:
             raise LoopyError("unexpected index type")
@@ -232,6 +233,10 @@ class CUDACASTBuilder(CASTBuilder):
         fdecl = super(CUDACASTBuilder, self).get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        assert isinstance(fdecl, FunctionDeclarationWrapper)
+        fdecl = fdecl.subdecl
+
         from cgen.cuda import CudaGlobal, CudaLaunchBounds
         fdecl = CudaGlobal(fdecl)
 
@@ -254,7 +259,7 @@ class CUDACASTBuilder(CASTBuilder):
 
             fdecl = CudaLaunchBounds(nthreads, fdecl)
 
-        return fdecl
+        return FunctionDeclarationWrapper(fdecl)
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
@@ -313,7 +318,7 @@ class CUDACASTBuilder(CASTBuilder):
                     % scope)
 
     def wrap_global_constant(self, decl):
-        from cgen.opencl import CudaConstant
+        from cgen.cuda import CudaConstant
         return CudaConstant(decl)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 536a186e7ef62bd2644ba81a11cc61a2079ac2be..80a69bd00c99258b709ea18b2a716c339b888b02 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -32,6 +32,7 @@ from loopy.diagnostic import LoopyError
 from loopy.symbolic import Literal
 from pymbolic import var
 import pymbolic.primitives as p
+from pymbolic.mapper.stringifier import PREC_NONE
 
 from pytools import memoize_method
 
@@ -236,16 +237,19 @@ class ISPCASTBuilder(CASTBuilder):
         arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
 
         if codegen_state.is_generating_device_code:
-            return ISPCTask(
+            result = ISPCTask(
                         FunctionDeclaration(
                             Value("void", name),
                             arg_decls))
         else:
-            return ISPCExport(
+            result = ISPCExport(
                     FunctionDeclaration(
                         Value("void", name),
                         arg_decls))
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        return FunctionDeclarationWrapper(result)
+
     # }}}
 
     def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
@@ -295,7 +299,7 @@ class ISPCASTBuilder(CASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def get_temporary_decl(self, knl, sched_index, temp_var, decl_info):
+    def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info):
         from loopy.target.c import POD  # uses the correct complex type
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
@@ -306,13 +310,16 @@ class ISPCASTBuilder(CASTBuilder):
             # FIXME: This is a pretty coarse way of deciding what
             # private temporaries get duplicated. Refine? (See also
             # above in expr to code mapper)
-            _, lsize = knl.get_grid_size_upper_bounds_as_exprs()
+            _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs()
             shape = lsize + shape
 
         if shape:
             from cgen import ArrayOf
-            temp_var_decl = ArrayOf(temp_var_decl,
-                    " * ".join(str(s) for s in shape))
+            ecm = self.get_expression_to_code_mapper(codegen_state)
+            temp_var_decl = ArrayOf(
+                    temp_var_decl,
+                    ecm(p.flattened_product(shape),
+                        prec=PREC_NONE, type_context="i"))
 
         return temp_var_decl
 
@@ -465,23 +472,22 @@ class ISPCASTBuilder(CASTBuilder):
         return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
+            lbound, ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
 
-        from loopy.symbolic import aff_to_expr
         from loopy.target.c import POD
 
         from pymbolic.mapper.stringifier import PREC_NONE
-        from cgen import For, Initializer
+        from cgen import For, InlineInitializer
 
         from cgen.ispc import ISPCUniform
 
         return For(
-                Initializer(
+                InlineInitializer(
                     ISPCUniform(POD(self, iname_dtype, iname)),
-                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                    ecm(lbound, PREC_NONE, "i")),
                 ecm(
-                    p.Comparison(var(iname), "<=", aff_to_expr(static_ubound)),
+                    p.Comparison(var(iname), "<=", ubound),
                     PREC_NONE, "i"),
                 "++%s" % iname,
                 inner)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index f0436099c6127e6426b03df2c48342b6ee99c67f..31cf7c6b648ebf370a17d8beb2538b9748ddb30a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -126,6 +126,7 @@ def _create_vector_types():
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
+
 _create_vector_types()
 
 
@@ -400,6 +401,10 @@ class OpenCLCASTBuilder(CASTBuilder):
         fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
+        from loopy.target.c import FunctionDeclarationWrapper
+        assert isinstance(fdecl, FunctionDeclarationWrapper)
+        fdecl = fdecl.subdecl
+
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
         fdecl = CLKernel(fdecl)
 
@@ -415,7 +420,7 @@ class OpenCLCASTBuilder(CASTBuilder):
 
             fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
 
-        return fdecl
+        return FunctionDeclarationWrapper(fdecl)
 
     def generate_top_of_body(self, codegen_state):
         from loopy.kernel.data import ImageArg
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 540cad00036de046484357826781353a927d7497..61e8e4f396126e17123c1bf775dbfeee2fe21f0d 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -25,7 +25,7 @@ THE SOFTWARE.
 import six
 from six.moves import range, zip
 
-from pytools import Record, memoize_method
+from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import ParameterFinderWarning
 from pytools.py_codegen import (
         Indentation, PythonFunctionGenerator)
@@ -610,7 +610,7 @@ def generate_invoker(kernel, codegen_result):
 
 # {{{ kernel executor
 
-class _CLKernelInfo(Record):
+class _CLKernelInfo(ImmutableRecord):
     pass
 
 
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 591161d818bf6691a0412b3a00d624f8b02dde5b..09a86665b7d949d7bf35b910cd2a6fd66109c1ec 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -29,10 +29,11 @@ import numpy as np
 
 from pymbolic.mapper import Mapper
 from pymbolic.mapper.stringifier import StringifyMapper
-from loopy.expression import TypeInferenceMapper
+from loopy.type_inference import TypeInferenceMapper
 from loopy.kernel.data import ValueArg
 from loopy.diagnostic import LoopyError  # noqa
 from loopy.target import ASTBuilderBase
+from genpy import Suite
 
 
 # {{{ expression to code
@@ -129,6 +130,30 @@ class ExpressionToPythonMapper(StringifyMapper):
     def map_local_hw_index(self, expr, enclosing_prec):
         raise LoopyError("plain Python does not have local hw axes")
 
+    def map_if(self, expr, enclosing_prec):
+        # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the
+        # operator precedence hierarchy (right above "or").
+        from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE
+        PREC_IFTHENELSE = PREC_LOGICAL_OR - 1
+
+        return self.parenthesize_if_needed(
+            "{then} if {cond} else {else_}".format(
+                then=self.rec(expr.then, PREC_IFTHENELSE),
+                cond=self.rec(expr.condition, PREC_IFTHENELSE),
+                else_=self.rec(expr.else_, PREC_IFTHENELSE)),
+            enclosing_prec, PREC_NONE)
+
+# }}}
+
+
+# {{{ genpy extensions
+
+class Collection(Suite):
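+    """A :class:`genpy.Suite` variant that generates its contents without
+    extra indentation; a stand-in until genpy ships its own ``Collection``
+    (see the note under ``ast_block_scope_class`` below)."""
+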
+    def generate(self):
+        for item in self.contents:
+            for item_line in item.generate():
+                yield item_line
+
 # }}}
 
 
@@ -219,15 +244,19 @@ class PythonASTBuilderBase(ASTBuilderBase):
 
     @property
     def ast_block_class(self):
-        from genpy import Suite
         return Suite
 
+    @property
+    def ast_block_scope_class(self):
+        # Once a new version of genpy is released, switch to this:
+        # from genpy import Collection
+        # and delete the implementation above.
+        return Collection
+
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
+            lbound, ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
 
-        from loopy.symbolic import aff_to_expr
-
         from pymbolic.mapper.stringifier import PREC_NONE
         from genpy import For
 
@@ -235,8 +264,8 @@ class PythonASTBuilderBase(ASTBuilderBase):
                 (iname,),
                 "range(%s, %s + 1)"
                 % (
-                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
-                    ecm(aff_to_expr(static_ubound), PREC_NONE, "i"),
+                    ecm(lbound, PREC_NONE, "i"),
+                    ecm(ubound, PREC_NONE, "i"),
                     ),
                 inner)
 
diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py
index 570b5efffb29e0ebb56b99444db19766127be596..f42fd3c8d2943bb37b75e9ef0003b88985950926 100644
--- a/loopy/transform/__init__.py
+++ b/loopy/transform/__init__.py
@@ -21,6 +21,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
-
-
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index 3c7bfed43b9bd02a4be3d71b2317cee94da75b4b..f4e6526a7b083f0b38dda1209b607aa38a62b68e 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -28,11 +28,11 @@ from islpy import dim_type
 from loopy.symbolic import (get_dependencies, SubstitutionMapper)
 from pymbolic.mapper.substitutor import make_subst_func
 
-from pytools import Record, memoize_method
+from pytools import ImmutableRecord, memoize_method
 from pymbolic import var
 
 
-class AccessDescriptor(Record):
+class AccessDescriptor(ImmutableRecord):
     """
     .. attribute:: identifier
 
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index b2c86c084f0c56ebfb6ec8ebe4f6f5e65c5fd37d..92cff7a507d672a3acc51a8abed572a04cb7e86a 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -500,7 +500,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         store_instruction = Assignment(
                     id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                     depends_on=frozenset(aar.modified_insn_ids),
-                    no_sync_with=frozenset([init_insn_id]),
+                    no_sync_with=frozenset([(init_insn_id, "any")]),
                     assignee=store_target,
                     expression=store_expression,
                     within_inames=(
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index bf6a6e1b98e6abbc4b483383f4bb9cf8b06bed1a..c35b5064365293ac78cdd01af537c9d28bd67193 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -79,8 +79,8 @@ __doc__ = """
 
 def set_loop_priority(kernel, loop_priority):
     from warnings import warn
-    warn("set_loop_priority is deprecated. Use prioritize_loops instead."
-         "Attention: A call to set_loop_priority will overwrite any previously"
+    warn("set_loop_priority is deprecated. Use prioritize_loops instead. "
+         "Attention: A call to set_loop_priority will overwrite any previously "
          "set priorities!", DeprecationWarning, stacklevel=2)
 
     if isinstance(loop_priority, str):
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 5566077128a3f2514f2f86d04748935e7b3ff18b..7c9c9688604179dce2aa7dcd6954d76a0df32cc7 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -76,21 +76,34 @@ def set_instruction_priority(kernel, insn_match, priority):
 
 # {{{ add_dependency
 
-def add_dependency(kernel, insn_match, dependency):
+def add_dependency(kernel, insn_match, depends_on):
     """Add the instruction dependency *dependency* to the instructions matched
     by *insn_match*.
 
-    *insn_match* may be any instruction id match understood by
+    *insn_match* and *depends_on* may be any instruction id match understood by
     :func:`loopy.match.parse_match`.
+
+    .. versionchanged:: 2016.3
+
+        The third argument was renamed to *depends_on* for clarity and is
+        now allowed to be a match expression, not just an instruction ID.
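+
+    For example (a sketch with illustrative instruction IDs)::
+
+        knl = lp.add_dependency(knl, "id:store_*", "id:init_*")
+
+    makes every instruction whose ID starts with ``store_`` depend on all
+    instructions whose ID starts with ``init_``.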
     """
 
-    if dependency not in kernel.id_to_insn:
-        raise LoopyError("cannot add dependency on non-existent instruction ID '%s'"
-                % dependency)
+    if isinstance(depends_on, str) and depends_on in kernel.id_to_insn:
+        added_deps = frozenset([depends_on])
+    else:
+        added_deps = frozenset(
+                dep.id for dep in find_instructions(kernel, depends_on))
+
+    if not added_deps:
+        raise LoopyError("no instructions found matching '%s' "
+                "(to add as dependencies)" % depends_on)
+
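+    # A mutable one-element list so that the add_dep closure below can
+    # record whether any instruction matched.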
+    matched = [False]
 
     def add_dep(insn):
         new_deps = insn.depends_on
-        added_deps = frozenset([dependency])
+        matched[0] = True
         if new_deps is None:
             new_deps = added_deps
         else:
@@ -98,7 +111,13 @@ def add_dependency(kernel, insn_match, dependency):
 
         return insn.copy(depends_on=new_deps)
 
-    return map_instructions(kernel, insn_match, add_dep)
+    result = map_instructions(kernel, insn_match, add_dep)
+
+    if not matched[0]:
+        raise LoopyError("no instructions found matching '%s' "
+                "(to which dependencies would be added)" % insn_match)
+
+    return result
 
 # }}}
 
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 5ab9dfab3c8ac0669c3e7eaf4091bb3ab4b0e2a2..a19e06ecdf7c9966501ebb9600ea4e01614363f4 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -799,7 +799,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     if temporary_scope == temp_var_scope.GLOBAL:
         barrier_insn_id = kernel.make_unique_instruction_id(
-                based_on=c_subst_name+"_b")
+                based_on=c_subst_name+"_barrier")
         from loopy.kernel.instruction import BarrierInstruction
         barrier_insn = BarrierInstruction(
                 id=barrier_insn_id,
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
new file mode 100644
index 0000000000000000000000000000000000000000..8706bc4da70b94ad678f07158e0a0f648fdd0030
--- /dev/null
+++ b/loopy/transform/save.py
@@ -0,0 +1,587 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from loopy.diagnostic import LoopyError
+import loopy as lp
+
+from loopy.kernel.data import auto
+from pytools import memoize_method, Record
+from loopy.schedule import (
+            EnterLoop, LeaveLoop, RunInstruction,
+            CallKernel, ReturnFromKernel, Barrier)
+
+from loopy.schedule.tools import (get_block_boundaries, InstructionQuery)
+
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: save_and_reload_temporaries
+"""
+
+
+# {{{ liveness analysis
+
+class LivenessResult(dict):
+
+    class InstructionResult(Record):
+        __slots__ = ["live_in", "live_out"]
+
+    @classmethod
+    def make_empty(cls, nscheditems):
+        return cls((idx, cls.InstructionResult(live_in=set(), live_out=set()))
+                   for idx in range(nscheditems))
+
+
+class LivenessAnalysis(object):
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.schedule = self.kernel.schedule
+
+    @memoize_method
+    def get_successor_relation(self):
+        successors = {}
+        block_bounds = get_block_boundaries(self.kernel.schedule)
+
+        for idx, (item, next_item) in enumerate(zip(
+                reversed(self.schedule),
+                reversed(self.schedule + [None]))):
+            sched_idx = len(self.schedule) - idx - 1
+
+            # Look at next_item
+            if next_item is None:
+                after = set()
+            elif isinstance(next_item, EnterLoop):
+                # Account for empty loop
+                loop_end = block_bounds[sched_idx + 1]
+                after = successors[loop_end] | set([sched_idx + 1])
+            elif isinstance(next_item, (LeaveLoop, RunInstruction,
+                    CallKernel, ReturnFromKernel, Barrier)):
+                after = set([sched_idx + 1])
+            else:
+                raise LoopyError("unexpected type of schedule item: {ty}"
+                    .format(ty=type(next_item).__name__))
+
+            # Look at item
+            if isinstance(item, LeaveLoop):
+                # Account for loop
+                loop_begin = block_bounds[sched_idx]
+                after |= set([loop_begin])
+            elif not isinstance(item, (EnterLoop, RunInstruction,
+                    CallKernel, ReturnFromKernel, Barrier)):
+                raise LoopyError("unexpected type of schedule item: {ty}"
+                    .format(ty=type(item).__name__))
+
+            successors[sched_idx] = after
+
+        return successors
+
+    def get_gen_and_kill_sets(self):
+        gen = dict((idx, set()) for idx in range(len(self.schedule)))
+        kill = dict((idx, set()) for idx in range(len(self.schedule)))
+
+        for sched_idx, sched_item in enumerate(self.schedule):
+            if not isinstance(sched_item, RunInstruction):
+                continue
+            insn = self.kernel.id_to_insn[sched_item.insn_id]
+            for var in insn.assignee_var_names():
+                if var not in self.kernel.temporary_variables:
+                    continue
+                if not insn.predicates:
+                    # Fully kills the liveness only when unconditional.
+                    kill[sched_idx].add(var)
+                if len(self.kernel.temporary_variables[var].shape) > 0:
+                    # For an array variable, all definitions generate a use as
+                    # well, because the write could be a partial write,
+                    # necessitating a reload of whatever is not written.
+                    #
+                    # We don't currently check if the write is a partial write
+                    # or a full write. Instead, we analyze the access
+                    # footprint later on to determine how much to reload/save.
+                    gen[sched_idx].add(var)
+            for var in insn.read_dependency_names():
+                if var not in self.kernel.temporary_variables:
+                    continue
+                gen[sched_idx].add(var)
+
+        return gen, kill
+
+    @memoize_method
+    def liveness(self):
+        logging.info("running liveness analysis")
+        successors = self.get_successor_relation()
+        gen, kill = self.get_gen_and_kill_sets()
+
+        # Fixed point iteration for liveness analysis
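+        # (Backward dataflow: live_out[i] is the union of live_in over the
+        # successors of i, and live_in[i] = gen[i] | (live_out[i] - kill[i]).)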
+        lr = LivenessResult.make_empty(len(self.schedule))
+
+        prev_lr = None
+
+        while prev_lr != lr:
+            from copy import deepcopy
+            prev_lr = deepcopy(lr)
+            for idx in range(len(self.schedule) - 1, -1, -1):
+                for succ in successors[idx]:
+                    lr[idx].live_out.update(lr[succ].live_in)
+                lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx])
+
+        logging.info("done running liveness analysis")
+
+        return lr
+
+    def print_liveness(self):
+        print(75 * "-")
+        print("LIVE IN:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_in))))
+        print(75 * "-")
+        print("LIVE OUT:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_out))))
+        print(75 * "-")
+
+    def __getitem__(self, sched_idx):
+        """
+        :arg sched_idx: The index of a schedule item.
+
+        :returns: The liveness information (live-in and live-out variable
+            sets) for the schedule item at *sched_idx*.
+        """
+        return self.liveness()[sched_idx]
+
+# }}}
+
+
+# {{{ save and reload implementation
+
+class TemporarySaver(object):
+
+    class PromotedTemporary(Record):
+        """
+        .. attribute:: name
+
+            The name of the new temporary.
+
+        .. attribute:: orig_temporary
+
+            The original temporary variable object.
+
+        .. attribute:: hw_inames
+
+            The list of hardware inames shared by all instructions that
+            access the original temporary.
+
+        .. attribute:: hw_dims
+
+            A list of expressions, prepended to the shape of the promoted
+            temporary, corresponding to hardware dimensions.
+
+        .. attribute:: non_hw_dims
+
+            A list of expressions for the remaining (non-hardware)
+            dimensions of the promoted temporary, i.e. the shape of the
+            original temporary.
+        """
+
+        @memoize_method
+        def as_variable(self):
+            temporary = self.orig_temporary
+            from loopy.kernel.data import TemporaryVariable, temp_var_scope
+            return TemporaryVariable(
+                name=self.name,
+                dtype=temporary.dtype,
+                scope=temp_var_scope.GLOBAL,
+                shape=self.new_shape)
+
+        @property
+        def new_shape(self):
+            return self.hw_dims + self.non_hw_dims
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.insn_query = InstructionQuery(kernel)
+        self.var_name_gen = kernel.get_var_name_generator()
+        self.insn_name_gen = kernel.get_instruction_id_generator()
+        # These fields keep track of updates to the kernel.
+        self.insns_to_insert = []
+        self.insns_to_update = {}
+        self.extra_args_to_add = {}
+        self.updated_iname_to_tag = {}
+        self.updated_temporary_variables = {}
+        self.saves_or_reloads_added = {}
+
+    @memoize_method
+    def auto_promote_temporary(self, temporary_name):
+        temporary = self.kernel.temporary_variables[temporary_name]
+
+        from loopy.kernel.data import temp_var_scope
+        if temporary.scope == temp_var_scope.GLOBAL:
+            # Nothing to be done for global temporaries (I hope)
+            return None
+
+        if temporary.base_storage is not None:
+            raise ValueError(
+                "Cannot promote temporaries with base_storage to global")
+
+        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
+        # is associated with. This is used for determining the shape of the
+        # global storage needed for saving and restoring the temporary across
+        # kernel calls.
+        #
+        # TODO: Make a policy decision about which dimensions to use. Currently,
+        # the code looks at each instruction that defines or uses the temporary,
+        # and takes the common set of hw-parallel tagged inames associated with
+        # these instructions.
+        #
+        # Furthermore, in the case of local temporaries, inames that are tagged
+        # hw-local do not contribute to the global storage shape.
+        hw_inames = self.insn_query.common_hw_inames(
+            self.insn_query.insns_reading_or_writing(temporary.name))
+
+        # We want hw_inames to be arranged according to the order:
+        #    g.0 < g.1 < ... < l.0 < l.1 < ...
+        # Sorting lexicographically accomplishes this.
+        hw_inames = sorted(hw_inames,
+            key=lambda iname: str(self.kernel.iname_to_tag[iname]))
+
+        # Calculate the sizes of the dimensions that get added in front for
+        # the global storage of the temporary.
+        hw_dims = []
+
+        backing_hw_inames = []
+
+        for iname in hw_inames:
+            tag = self.kernel.iname_to_tag[iname]
+            from loopy.kernel.data import LocalIndexTag
+            is_local_iname = isinstance(tag, LocalIndexTag)
+            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
+                # Restrict shape to that of group inames for locals.
+                continue
+            backing_hw_inames.append(iname)
+            from loopy.isl_helpers import static_max_of_pw_aff
+            from loopy.symbolic import aff_to_expr
+            hw_dims.append(
+                aff_to_expr(
+                    static_max_of_pw_aff(
+                        self.kernel.get_iname_bounds(iname).size, False)))
+
+        non_hw_dims = temporary.shape
+
+        if len(non_hw_dims) == 0 and len(hw_dims) == 0:
+            # Scalar not in hardware: ensure at least one dimension.
+            non_hw_dims = (1,)
+
+        backing_temporary = self.PromotedTemporary(
+            name=self.var_name_gen(temporary.name + "_save_slot"),
+            orig_temporary=temporary,
+            hw_dims=tuple(hw_dims),
+            non_hw_dims=non_hw_dims,
+            hw_inames=backing_hw_inames)
+
+        return backing_temporary
+
+    def save_or_reload_impl(self, temporary, subkernel, mode,
+                             promoted_temporary=lp.auto):
+        assert mode in ("save", "reload")
+
+        if promoted_temporary is auto:
+            promoted_temporary = self.auto_promote_temporary(temporary)
+
+        if promoted_temporary is None:
+            return
+
+        from loopy.kernel.tools import DomainChanger
+        dchg = DomainChanger(
+            self.kernel,
+            frozenset(
+                self.insn_query.inames_in_subkernel(subkernel) |
+                set(promoted_temporary.hw_inames)))
+
+        domain, hw_inames, dim_inames, iname_to_tag = \
+            self.augment_domain_for_save_or_reload(
+                dchg.domain, promoted_temporary, mode, subkernel)
+
+        self.kernel = dchg.get_kernel_with(domain)
+
+        save_or_load_insn_id = self.insn_name_gen(
+            "{name}.{mode}".format(name=temporary, mode=mode))
+
+        def subscript_or_var(agg, subscript=()):
+            from pymbolic.primitives import Subscript, Variable
+            if len(subscript) == 0:
+                return Variable(agg)
+            else:
+                return Subscript(
+                    Variable(agg),
+                    tuple(map(Variable, subscript)))
+
+        dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)]
+
+        args = (
+            subscript_or_var(
+                temporary, dim_inames_trunc),
+            subscript_or_var(
+                promoted_temporary.name, hw_inames + dim_inames))
+
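+        # args is (assignee, expression): a reload assigns the save slot
+        # into the temporary; for a save, swap so the save slot is assigned.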
+        if mode == "save":
+            args = reversed(args)
+
+        accessing_insns_in_subkernel = (
+            self.insn_query.insns_reading_or_writing(temporary) &
+            self.insn_query.insns_in_subkernel(subkernel))
+
+        if mode == "save":
+            depends_on = accessing_insns_in_subkernel
+            update_deps = frozenset()
+        elif mode == "reload":
+            depends_on = frozenset()
+            update_deps = accessing_insns_in_subkernel
+
+        pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel)
+
+        if pre_barrier is not None:
+            depends_on |= set([pre_barrier])
+
+        if post_barrier is not None:
+            update_deps |= set([post_barrier])
+
+        # Create the load / store instruction.
+        from loopy.kernel.data import Assignment
+        save_or_load_insn = Assignment(
+            *args,
+            id=save_or_load_insn_id,
+            within_inames=(
+                self.insn_query.inames_in_subkernel(subkernel) |
+                frozenset(hw_inames + dim_inames)),
+            within_inames_is_final=True,
+            depends_on=depends_on,
+            boostable=False,
+            boostable_into=frozenset())
+
+        if temporary not in self.saves_or_reloads_added:
+            self.saves_or_reloads_added[temporary] = set()
+        self.saves_or_reloads_added[temporary].add(save_or_load_insn_id)
+
+        self.insns_to_insert.append(save_or_load_insn)
+
+        for insn_id in update_deps:
+            insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id])
+            self.insns_to_update[insn_id] = insn.copy(
+                depends_on=insn.depends_on | frozenset([save_or_load_insn_id]))
+
+        self.updated_temporary_variables[promoted_temporary.name] = \
+            promoted_temporary.as_variable()
+
+        self.updated_iname_to_tag.update(iname_to_tag)
+
+    @memoize_method
+    def finish(self):
+        new_instructions = []
+
+        insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert)
+
+        # Add global no_sync_with between any added reloads and saves
+        from six import iteritems
+        for temporary, added_insns in iteritems(self.saves_or_reloads_added):
+            for insn_id in added_insns:
+                insn = insns_to_insert[insn_id]
+                insns_to_insert[insn_id] = insn.copy(
+                    no_sync_with=frozenset(
+                        (added_insn, "global") for added_insn in added_insns))
+
+        for orig_insn in self.kernel.instructions:
+            if orig_insn.id in self.insns_to_update:
+                new_instructions.append(self.insns_to_update[orig_insn.id])
+            else:
+                new_instructions.append(orig_insn)
+        new_instructions.extend(
+            sorted(insns_to_insert.values(), key=lambda insn: insn.id))
+
+        self.updated_iname_to_tag.update(self.kernel.iname_to_tag)
+        self.updated_temporary_variables.update(self.kernel.temporary_variables)
+
+        kernel = self.kernel.copy(
+            instructions=new_instructions,
+            iname_to_tag=self.updated_iname_to_tag,
+            temporary_variables=self.updated_temporary_variables)
+
+        from loopy.kernel.tools import assign_automatic_axes
+        return assign_automatic_axes(kernel)
+
+    def save(self, temporary, subkernel):
+        self.save_or_reload_impl(temporary, subkernel, "save")
+
+    def reload(self, temporary, subkernel):
+        self.save_or_reload_impl(temporary, subkernel, "reload")
+
+    def augment_domain_for_save_or_reload(self,
+            domain, promoted_temporary, mode, subkernel):
+        """
+        Add new axes to the domain, corresponding to the dimensions of
+        *promoted_temporary*. These axes will be used by the save/reload
+        instructions.
+        """
+        assert mode in ("save", "reload")
+        import islpy as isl
+
+        orig_temporary = promoted_temporary.orig_temporary
+        orig_dim = domain.dim(isl.dim_type.set)
+
+        # Tags for newly added inames
+        iname_to_tag = {}
+
+        # FIXME: Restrict size of new inames to access footprint.
+
+        # Add dimension-dependent inames.
+        dim_inames = []
+        domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims))
+
+        for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims):
+            new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}".
+                format(name=orig_temporary.name,
+                       mode=mode,
+                       dim=dim_idx,
+                       sk=subkernel))
+            domain = domain.set_dim_name(
+                isl.dim_type.set, orig_dim + dim_idx, new_iname)
+
+            if orig_temporary.is_local:
+                # If the temporary has local scope, then loads / stores can
+                # be done in parallel.
+                from loopy.kernel.data import AutoFitLocalIndexTag
+                iname_to_tag[new_iname] = AutoFitLocalIndexTag()
+
+            dim_inames.append(new_iname)
+
+            # Add size information.
+            aff = isl.affs_from_space(domain.space)
+            domain &= aff[0].le_set(aff[new_iname])
+            from loopy.symbolic import aff_from_expr
+            domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size))
+
+        # FIXME: Use promoted_temporary.hw_inames
+        hw_inames = []
+
+        # Add duplicates of the hardware inames.
+        for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
+            new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}".
+                format(name=orig_temporary.name,
+                       mode=mode,
+                       dim=t_idx,
+                       sk=subkernel))
+            hw_inames.append(new_iname)
+            iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname]
+
+        from loopy.isl_helpers import duplicate_axes
+        domain = duplicate_axes(
+            domain, promoted_temporary.hw_inames, hw_inames)
+
+        # The operations on the domain above return a Set object, but the
+        # underlying domain should be expressible as a single BasicSet.
+        domain_list = domain.get_basic_set_list()
+        assert domain_list.n_basic_set() == 1
+        domain = domain_list.get_basic_set(0)
+        return domain, hw_inames, dim_inames, iname_to_tag
+
+# }}}
+
+
+# {{{ auto save and reload across kernel calls
+
+def save_and_reload_temporaries(knl):
+    """
+    Add instructions to save and reload temporary variables that are live
+    across kernel calls.
+
+    The basic code transformation turns schedule segments::
+
+        t = <...>
+        <return followed by call>
+        <...> = t
+
+    into this code::
+
+        t = <...>
+        t_save_slot = t
+        <return followed by call>
+        t = t_save_slot
+        <...> = t
+
+    where ``t_save_slot`` is a newly-created global temporary variable.
+
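+    A typical invocation, sketched after the pattern used in the test
+    suite (it assumes *knl* has already been preprocessed and scheduled)::
+
+        knl = lp.preprocess_kernel(knl)
+        knl = lp.get_one_scheduled_kernel(knl)
+        knl = lp.save_and_reload_temporaries(knl)
+        # reschedule to order the newly added save/reload instructions
+        knl = lp.get_one_scheduled_kernel(knl)
+
+    :arg knl: a kernel that has already been scheduled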
+    :returns: The resulting kernel
+    """
+    liveness = LivenessAnalysis(knl)
+    saver = TemporarySaver(knl)
+
+    insn_query = InstructionQuery(knl)
+
+    for sched_idx, sched_item in enumerate(knl.schedule):
+
+        if isinstance(sched_item, CallKernel):
+            # Any temporary that is read or written here and is live-out
+            # needs to be reloaded: even a temporary that is only
+            # written may be written partially, so its remaining entries
+            # must keep their previously saved values.
+            if sched_idx == 0:
+                # Kernel entry: nothing live
+                interesting_temporaries = set()
+            else:
+                interesting_temporaries = (
+                    insn_query.temporaries_read_or_written_in_subkernel(
+                        sched_item.kernel_name))
+
+            for temporary in liveness[sched_idx].live_out & interesting_temporaries:
+                logger.info("reloading {0} at entry of {1}"
+                        .format(temporary, sched_item.kernel_name))
+                saver.reload(temporary, sched_item.kernel_name)
+
+        elif isinstance(sched_item, ReturnFromKernel):
+            if sched_idx == len(knl.schedule) - 1:
+                # Kernel exit: nothing live
+                interesting_temporaries = set()
+            else:
+                interesting_temporaries = (
+                    insn_query.temporaries_written_in_subkernel(
+                        sched_item.kernel_name))
+
+            for temporary in liveness[sched_idx].live_in & interesting_temporaries:
+                logger.info("saving {0} before return of {1}"
+                        .format(temporary, sched_item.kernel_name))
+                saver.save(temporary, sched_item.kernel_name)
+
+    return saver.finish()
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index b2b76ae9f3a92d93feca2dc9b31591f215b9341e..79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -31,7 +31,7 @@ from loopy.symbolic import (
 from loopy.diagnostic import LoopyError
 from pymbolic.mapper.substitutor import make_subst_func
 
-from pytools import Record
+from pytools import ImmutableRecord
 from pymbolic import var
 
 
@@ -39,7 +39,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-class ExprDescriptor(Record):
+class ExprDescriptor(ImmutableRecord):
     __slots__ = ["insn", "expr", "unif_var_dict"]
 
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..a31f011a0ce8e5403b54984eb45db0970a8370b0
--- /dev/null
+++ b/loopy/type_inference.py
@@ -0,0 +1,581 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+
+from pymbolic.mapper import CombineMapper
+import numpy as np
+
+from loopy.tools import is_integer
+from loopy.types import NumpyType
+
+from loopy.diagnostic import (
+        LoopyError,
+        TypeInferenceFailure, DependencyTypeInferenceFailure)
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+# {{{ type inference mapper
+
+class TypeInferenceMapper(CombineMapper):
+    def __init__(self, kernel, new_assignments=None):
+        """
+        :arg new_assignments: mapping from names to either
+            :class:`loopy.kernel.data.TemporaryVariable`
+            or
+            :class:`loopy.kernel.data.KernelArgument`
+            instances
+        """
+        self.kernel = kernel
+        if new_assignments is None:
+            new_assignments = {}
+        self.new_assignments = new_assignments
+        self.symbols_with_unknown_types = set()
+
+    def __call__(self, expr, return_tuple=False, return_dtype_set=False):
+        kwargs = {}
+        if return_tuple:
+            kwargs["return_tuple"] = True
+
+        result = super(TypeInferenceMapper, self).__call__(
+                expr, **kwargs)
+
+        assert isinstance(result, list)
+
+        if return_tuple:
+            for result_i in result:
+                assert isinstance(result_i, tuple)
+
+            assert return_dtype_set
+            return result
+
+        else:
+            if return_dtype_set:
+                return result
+            else:
+                if not result:
+                    raise DependencyTypeInferenceFailure(
+                            ", ".join(sorted(self.symbols_with_unknown_types)))
+
+                result, = result
+                return result
+
+    # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
+    # are Python-equal (for many common constants such as integers).
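+    # For instance, np.float32(1) == np.float64(1) holds, so a cache
+    # keyed on the constant's value alone would conflate the two types.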
+
+    def copy(self):
+        return type(self)(self.kernel, self.new_assignments)
+
+    def with_assignments(self, names_to_vars):
+        new_ass = self.new_assignments.copy()
+        new_ass.update(names_to_vars)
+        return type(self)(self.kernel, new_ass)
+
+    @staticmethod
+    def combine(dtype_sets):
+        """
+        :arg dtype_sets: A list of lists, where each of the inner lists
+            consists of either zero or one type. An empty list is
+            consistent with any type. A list with a type requires
+            that an operation be valid in conjunction with that type.
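+
+        For instance (illustrative, following the promotion rules
+        below): combining ``[NumpyType(int32)]`` with an empty list
+        yields ``[NumpyType(int32)]``, while combining
+        ``[NumpyType(int32)]`` with ``[NumpyType(float32)]`` yields
+        ``[NumpyType(float32)]`` (deliberately kept single-precision,
+        unlike numpy).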
+        """
+        dtype_sets = list(dtype_sets)
+
+        from loopy.types import LoopyType, NumpyType
+        assert all(
+                all(isinstance(dtype, LoopyType) for dtype in dtype_set)
+                for dtype_set in dtype_sets)
+        assert all(
+                0 <= len(dtype_set) <= 1
+                for dtype_set in dtype_sets)
+
+        if not all(
+                isinstance(dtype, NumpyType)
+                for dtype_set in dtype_sets
+                for dtype in dtype_set):
+            from pytools import is_single_valued, single_valued
+            if not is_single_valued(
+                    dtype
+                    for dtype_set in dtype_sets
+                    for dtype in dtype_set):
+                raise TypeInferenceFailure(
+                        "Nothing known about operations between '%s'"
+                        % ", ".join(str(dtype)
+                            for dtype_set in dtype_sets
+                            for dtype in dtype_set))
+
+            return single_valued(dtype
+                            for dtype_set in dtype_sets
+                            for dtype in dtype_set)
+
+        numpy_dtypes = [dtype.dtype
+                for dtype_set in dtype_sets
+                for dtype in dtype_set]
+
+        if not numpy_dtypes:
+            return []
+
+        result = numpy_dtypes.pop()
+        while numpy_dtypes:
+            other = numpy_dtypes.pop()
+
+            if result.fields is None and other.fields is None:
+                if (result, other) in [
+                        (np.int32, np.float32), (np.float32, np.int32)]:
+                    # numpy makes this a double. I disagree.
+                    result = np.dtype(np.float32)
+                else:
+                    result = (
+                            np.empty(0, dtype=result)
+                            + np.empty(0, dtype=other)
+                            ).dtype
+
+            elif result.fields is None and other.fields is not None:
+                # assume the non-native type takes over
+                # (This is used for vector types.)
+                result = other
+            elif result.fields is not None and other.fields is None:
+                # assume the non-native type takes over
+                # (This is used for vector types.)
+                pass
+            else:
+                if result is not other:
+                    raise TypeInferenceFailure(
+                            "nothing known about result of operation on "
+                            "'%s' and '%s'" % (result, other))
+
+        return [NumpyType(result)]
+
+    def map_sum(self, expr):
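+        # Small integer constants (|c| < 1024) are collected separately:
+        # their types only enter the result if all other operands are
+        # integral, so that e.g. 'x + 1' with float32 x stays float32.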
+        dtype_sets = []
+        small_integer_dtype_sets = []
+        for child in expr.children:
+            dtype_set = self.rec(child)
+            if is_integer(child) and abs(child) < 1024:
+                small_integer_dtype_sets.append(dtype_set)
+            else:
+                dtype_sets.append(dtype_set)
+
+        from pytools import all
+        if all(dtype.is_integral()
+                for dtype_set in dtype_sets
+                for dtype in dtype_set):
+            dtype_sets.extend(small_integer_dtype_sets)
+
+        return self.combine(dtype_sets)
+
+    map_product = map_sum
+
+    def map_quotient(self, expr):
+        n_dtype_set = self.rec(expr.numerator)
+        d_dtype_set = self.rec(expr.denominator)
+
+        dtypes = n_dtype_set + d_dtype_set
+
+        if all(dtype.is_integral() for dtype in dtypes):
+            # both integers
+            return [NumpyType(np.dtype(np.float64))]
+
+        else:
+            return self.combine([n_dtype_set, d_dtype_set])
+
+    def map_constant(self, expr):
+        if is_integer(expr):
+            for tp in [np.int32, np.int64]:
+                iinfo = np.iinfo(tp)
+                if iinfo.min <= expr <= iinfo.max:
+                    return [NumpyType(np.dtype(tp))]
+
+            # no integer type known to loopy is large enough
+            raise TypeInferenceFailure(
+                    "integer constant '%s' too large" % expr)
+
+        dt = np.asarray(expr).dtype
+        if hasattr(expr, "dtype"):
+            return [NumpyType(expr.dtype)]
+        elif isinstance(expr, np.number):
+            # Numpy types are sized
+            return [NumpyType(np.dtype(type(expr)))]
+        elif dt.kind == "f":
+            # deduce the smaller type by default
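+            # (e.g. a bare literal such as 0.5 becomes float32)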
+            return [NumpyType(np.dtype(np.float32))]
+        elif dt.kind == "c":
+            if np.complex64(expr) == np.complex128(expr):
+                # (COMPLEX_GUESS_LOGIC)
+                # No precision is lost by 'guessing' single precision, use that.
+                # This at least covers simple cases like '1j'.
+                return [NumpyType(np.dtype(np.complex64))]
+
+            # Codegen for complex types depends on exactly correct types.
+            # Refuse temptation to guess.
+            raise TypeInferenceFailure("Complex constant '%s' needs to "
+                    "be sized for type inference " % expr)
+        else:
+            raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
+
+    def map_subscript(self, expr):
+        return self.rec(expr.aggregate)
+
+    def map_linear_subscript(self, expr):
+        return self.rec(expr.aggregate)
+
+    def map_call(self, expr, return_tuple=False):
+        from pymbolic.primitives import Variable
+
+        identifier = expr.function
+        if isinstance(identifier, Variable):
+            identifier = identifier.name
+
+        if identifier in ["indexof", "indexof_vec"]:
+            return [self.kernel.index_dtype]
+
+        def none_if_empty(d):
+            if d:
+                d, = d
+                return d
+            else:
+                return None
+
+        arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters)
+        if None in arg_dtypes:
+            return []
+
+        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
+        if return_tuple:
+            if mangle_result is not None:
+                return [mangle_result.result_dtypes]
+        else:
+            if mangle_result is not None:
+                if len(mangle_result.result_dtypes) != 1 and not return_tuple:
+                    raise LoopyError("functions with more or fewer than one "
+                            "return value may only be used in direct assignments")
+
+                return [mangle_result.result_dtypes[0]]
+
+        raise RuntimeError("unable to resolve "
+                "function '%s' with %d given arguments"
+                % (identifier, len(arg_dtypes)))
+
+    def map_variable(self, expr):
+        if expr.name in self.kernel.all_inames():
+            return [self.kernel.index_dtype]
+
+        result = self.kernel.mangle_symbol(
+                self.kernel.target.get_device_ast_builder(),
+                expr.name)
+
+        if result is not None:
+            result_dtype, _ = result
+            return [result_dtype]
+
+        obj = self.new_assignments.get(expr.name)
+
+        if obj is None:
+            obj = self.kernel.arg_dict.get(expr.name)
+
+        if obj is None:
+            obj = self.kernel.temporary_variables.get(expr.name)
+
+        if obj is None:
+            raise TypeInferenceFailure("name not known in type inference: %s"
+                    % expr.name)
+
+        from loopy.kernel.data import TemporaryVariable, KernelArgument
+        import loopy as lp
+        if isinstance(obj, TemporaryVariable):
+            result = [obj.dtype]
+            if result[0] is lp.auto:
+                self.symbols_with_unknown_types.add(expr.name)
+                return []
+            else:
+                return result
+
+        elif isinstance(obj, KernelArgument):
+            result = [obj.dtype]
+            if result[0] is None:
+                self.symbols_with_unknown_types.add(expr.name)
+                return []
+            else:
+                return result
+
+        else:
+            raise RuntimeError("unexpected type inference "
+                    "object type for '%s'" % expr.name)
+
+    map_tagged_variable = map_variable
+
+    def map_lookup(self, expr):
+        agg_result = self.rec(expr.aggregate)
+        if not agg_result:
+            return agg_result
+
+        field = agg_result[0].numpy_dtype.fields[expr.name]
+        dtype = field[0]
+        return [NumpyType(dtype)]
+
+    def map_comparison(self, expr):
+        # "bool" is unusable because OpenCL's bool has indeterminate memory
+        # format.
+        return [NumpyType(np.dtype(np.int32))]
+
+    map_logical_not = map_comparison
+    map_logical_and = map_comparison
+    map_logical_or = map_comparison
+
+    def map_group_hw_index(self, expr, *args):
+        return [self.kernel.index_dtype]
+
+    def map_local_hw_index(self, expr, *args):
+        return [self.kernel.index_dtype]
+
+    def map_reduction(self, expr, return_tuple=False):
+        rec_result = self.rec(expr.expr)
+
+        if rec_result:
+            rec_result, = rec_result
+            result = expr.operation.result_dtypes(
+                    self.kernel, rec_result, expr.inames)
+        else:
+            result = expr.operation.result_dtypes(
+                    self.kernel, None, expr.inames)
+
+        if result is None:
+            return []
+
+        if return_tuple:
+            return [result]
+
+        else:
+            if len(result) != 1 and not return_tuple:
+                raise LoopyError("reductions with more or fewer than one "
+                        "return value may only be used in direct assignments")
+
+            return [result[0]]
+
+# }}}
+
+
+# {{{ infer single variable
+
+def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
+    if var_name in kernel.all_params():
+        return [kernel.index_dtype], []
+
+    def debug(s):
+        logger.debug("%s: %s" % (kernel.name, s))
+
+    dtype_sets = []
+
+    import loopy as lp
+
+    type_inf_mapper = type_inf_mapper.copy()
+
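+    # Collect candidate types from every instruction that writes the
+    # variable; combine() below merges them and checks consistency.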
+    for writer_insn_id in kernel.writer_map().get(var_name, []):
+        writer_insn = kernel.id_to_insn[writer_insn_id]
+        if not isinstance(writer_insn, lp.MultiAssignmentBase):
+            continue
+
+        expr = subst_expander(writer_insn.expression)
+
+        debug("             via expr %s" % expr)
+        if isinstance(writer_insn, lp.Assignment):
+            result = type_inf_mapper(expr, return_dtype_set=True)
+        elif isinstance(writer_insn, lp.CallInstruction):
+            return_dtype_set = type_inf_mapper(expr, return_tuple=True,
+                    return_dtype_set=True)
+
+            result = []
+            for return_dtype_set in return_dtype_set:
+                result_i = None
+                found = False
+                for assignee, comp_dtype_set in zip(
+                        writer_insn.assignee_var_names(), return_dtype_set):
+                    if assignee == var_name:
+                        found = True
+                        result_i = comp_dtype_set
+                        break
+
+                assert found
+                if result_i is not None:
+                    result.append(result_i)
+
+        debug("             result: %s" % result)
+
+        dtype_sets.append(result)
+
+    if not dtype_sets:
+        return None, type_inf_mapper.symbols_with_unknown_types
+
+    result = type_inf_mapper.combine(dtype_sets)
+
+    return result, type_inf_mapper.symbols_with_unknown_types
+
+# }}}
+
+
+class _DictUnionView:
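+    """A read-only union view over several mappings: a lookup returns
+    the value from the first mapping that contains the key."""
+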
+    def __init__(self, children):
+        self.children = children
+
+    def get(self, key):
+        try:
+            return self[key]
+        except KeyError:
+            return None
+
+    def __getitem__(self, key):
+        for ch in self.children:
+            try:
+                return ch[key]
+            except KeyError:
+                pass
+
+        raise KeyError(key)
+
+
+# {{{ infer_unknown_types
+
+def infer_unknown_types(kernel, expect_completion=False):
+    """Infer types on temporaries and arguments."""
+
+    logger.debug("%s: infer types" % kernel.name)
+
+    def debug(s):
+        logger.debug("%s: %s" % (kernel.name, s))
+
+    unexpanded_kernel = kernel
+    if kernel.substitutions:
+        from loopy.transform.subst import expand_subst
+        kernel = expand_subst(kernel)
+
+    new_temp_vars = kernel.temporary_variables.copy()
+    new_arg_dict = kernel.arg_dict.copy()
+
+    # {{{ find names_with_unknown_types
+
+    # contains both arguments and temporaries
+    names_for_type_inference = []
+
+    import loopy as lp
+    for tv in six.itervalues(kernel.temporary_variables):
+        if tv.dtype is lp.auto:
+            names_for_type_inference.append(tv.name)
+
+    for arg in kernel.args:
+        if arg.dtype is None:
+            names_for_type_inference.append(arg.name)
+
+    # }}}
+
+    item_lookup = _DictUnionView([
+            new_temp_vars,
+            new_arg_dict
+            ])
+    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)
+
+    from loopy.symbolic import SubstitutionRuleExpander
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+
+    # {{{ work on type inference queue
+
+    from loopy.kernel.data import TemporaryVariable, KernelArgument
+
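+    # Iterate to a fixed point: re-run the whole queue for as long as
+    # the previous full pass changed at least one type.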
+    changed_during_last_queue_run = False
+    queue = names_for_type_inference[:]
+
+    failed_names = set()
+    while queue or changed_during_last_queue_run:
+        if not queue and changed_during_last_queue_run:
+            changed_during_last_queue_run = False
+            queue = names_for_type_inference[:]
+
+        name = queue.pop(0)
+        item = item_lookup[name]
+
+        debug("inferring type for %s %s" % (type(item).__name__, item.name))
+
+        result, symbols_with_unavailable_types = \
+                _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
+
+        failed = not result
+        if not failed:
+            new_dtype, = result
+            debug("     success: %s" % new_dtype)
+            if new_dtype != item.dtype:
+                debug("     changed from: %s" % item.dtype)
+                changed_during_last_queue_run = True
+
+                if isinstance(item, TemporaryVariable):
+                    new_temp_vars[name] = item.copy(dtype=new_dtype)
+                elif isinstance(item, KernelArgument):
+                    new_arg_dict[name] = item.copy(dtype=new_dtype)
+                else:
+                    raise LoopyError("unexpected item type in type inference")
+        else:
+            debug("     failure")
+
+        if failed:
+            if item.name in failed_names:
+                # this item has failed before, give up.
+                advice = ""
+                if symbols_with_unavailable_types:
+                    advice += (
+                            " (need type of '%s'--check for missing arguments)"
+                            % ", ".join(symbols_with_unavailable_types))
+
+                if expect_completion:
+                    raise LoopyError(
+                            "could not determine type of '%s'%s"
+                            % (item.name, advice))
+
+                else:
+                    # We're done here.
+                    break
+
+            # remember that this item failed
+            failed_names.add(item.name)
+
+            if set(queue) == failed_names:
+                # We did what we could...
+                debug("could not infer types for the remaining names: %s"
+                        % ", ".join(sorted(failed_names)))
+                assert not expect_completion
+                break
+
+            # can't infer type yet, put back into queue
+            queue.append(name)
+        else:
+            # we've made progress, reset failure markers
+            failed_names = set()
+
+    # }}}
+
+    return unexpanded_kernel.copy(
+            temporary_variables=new_temp_vars,
+            args=[new_arg_dict[arg.name] for arg in kernel.args],
+            )
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/types.py b/loopy/types.py
index b897d9f700b198e73d95a09c7d459ed2d7f877b1..f095d1d58f9eaebb7dcc9c8d41afa73951f2ba84 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -177,7 +177,8 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}
 
 
-def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
+def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False,
+        target=None):
     from loopy.kernel.data import auto
     if allow_none and dtype is None:
         return dtype
@@ -192,10 +193,13 @@ def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
         except Exception:
             pass
 
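+    # A string dtype (e.g. a C type name) may be resolved through the
+    # target's dtype registry, if a target was supplied.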
+    if numpy_dtype is None and target is not None and isinstance(dtype, str):
+        numpy_dtype = target.get_dtype_registry().get_or_register_dtype(dtype)
+
     if isinstance(dtype, LoopyType):
         if for_atomic:
             if isinstance(dtype, NumpyType):
-                return AtomicNumpyType(dtype.dtype)
+                return AtomicNumpyType(dtype.dtype, target=target)
             elif not isinstance(dtype, AtomicType):
                 raise LoopyError("do not know how to convert '%s' to an atomic type"
                         % dtype)
@@ -204,9 +208,9 @@ def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
 
     elif numpy_dtype is not None:
         if for_atomic:
-            return AtomicNumpyType(numpy_dtype)
+            return AtomicNumpyType(numpy_dtype, target=target)
         else:
-            return NumpyType(numpy_dtype)
+            return NumpyType(numpy_dtype, target=target)
 
     else:
         raise TypeError("dtype must be a LoopyType, or convertible to one, "
diff --git a/loopy/version.py b/loopy/version.py
index aa3e7abee41a05595985df574da52c024b52dcb5..f7d157f650304a83164e11763279d3c5eabbc4c0 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v44-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v49-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 5c8f377a6855d0cb3e2f7e759d1dc0b314f31817..a941eecd2b58daf413830fc22500179d3e8a8cf1 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2016.1",
+          "pytools>=2016.2.6",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_apps.py b/test/test_apps.py
index 790a44f6acac72e4fa6fe04a45f32813e6204bb9..9eab3fdb1fbc152b65344362d39766793d372d90 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -502,6 +502,112 @@ def test_lbm(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20})
 
 
+def test_fd_demo():
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i,j<n}",
+        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
+                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
+                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
+    #assumptions="n mod 16=0")
+    knl = lp.split_iname(knl,
+            "i", 16, outer_tag="g.1", inner_tag="l.1")
+    knl = lp.split_iname(knl,
+            "j", 16, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.add_prefetch(knl, "u",
+            ["i_inner", "j_inner"],
+            fetch_bounding_box=True)
+
+    #n = 1000
+    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
+
+    knl = lp.set_options(knl, write_cl=True)
+    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
+    code, inf = lp.generate_code(knl)
+    print(code)
+
+    assert "double" not in code
+
+
+def test_fd_1d(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        "result[i] = u[i+1]-u[i]")
+
+    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16)
+    knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j")
+    knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for")
+    knl = lp.assume(knl, "n mod 16 = 0")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=2048))
+
+
+def test_poisson_fem(ctx_factory):
+    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
+    ctx = ctx_factory()
+
+    nbf = 5
+    nqp = 5
+    sdim = 3
+
+    knl = lp.make_kernel(
+            "{ [c,i,j,k,ell,ell2,ell3]: \
+            0 <= c < nels and \
+            0 <= i < nbf and \
+            0 <= j < nbf and \
+            0 <= k < nqp and \
+            0 <= ell,ell2 < sdim}",
+            """
+            dpsi(bf,k0,dir) := \
+                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
+            Ael[c,i,j] = \
+                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
+            """,
+            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
+
+    print(knl)
+
+    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
+
+    ref_knl = knl
+
+    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])
+
+    def variant_1(knl):
+        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
+        knl = lp.prioritize_loops(knl, "c,i,j")
+        return knl
+
+    def variant_2(knl):
+        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
+        knl = lp.prioritize_loops(knl, "c,i,j")
+        return knl
+
+    def add_types(knl):
+        return lp.add_and_infer_dtypes(knl, dict(
+            w=np.float32,
+            J=np.float32,
+            DPsi=np.float32,
+            DFinv=np.float32,
+            ))
+
+    for variant in [
+            #variant_1,
+            variant_2
+            ]:
+        knl = variant(knl)
+
+        lp.auto_test_vs_ref(
+                add_types(ref_knl), ctx, add_types(knl),
+                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_isl.py b/test/test_isl.py
index 3bd3d221e54df685238cfd1532d2b32662aac99f..f793b1fa99f8768ff4e2fcfaa02aa87119ffcc92 100644
--- a/test/test_isl.py
+++ b/test/test_isl.py
@@ -44,6 +44,13 @@ def test_aff_to_expr_2():
     assert aff_to_expr(x) == (-1)*i0 + 2*(i0 // 2)
 
 
+def test_pw_aff_to_conditional_expr():
+    from loopy.symbolic import pw_aff_to_expr
+    cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }")
+    expr = pw_aff_to_expr(cond)
+    assert str(expr) == "If(i == 0, 0, -1 + i)"
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 4c0ff097bc59e56de6268c0ddd67bd686465b5d8..c5b423936f998b3cc2ef66b07ef3f88aa398cd17 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1105,95 +1105,215 @@ def test_kernel_splitting_with_loop(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
-def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
+def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
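+    """Schedule *knl*, apply save_and_reload_temporaries, reschedule,
+    run, and check the result against *out_expect*."""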
+    from loopy.preprocess import preprocess_kernel
+    from loopy.schedule import get_one_scheduled_kernel
+
+    knl = preprocess_kernel(knl)
+    knl = get_one_scheduled_kernel(knl)
+
+    from loopy.transform.save import save_and_reload_temporaries
+    knl = save_and_reload_temporaries(knl)
+    knl = get_one_scheduled_kernel(knl)
+
+    if debug:
+        print(knl)
+        cgr = lp.generate_code_v2(knl)
+        print(cgr.device_code())
+        print(cgr.host_code())
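+        # intentionally raise an error so the run stops after printing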
+        1/0
+
+    _, (out,) = knl(queue, out_host=True)
+    assert (out == out_expect).all()
+
+
+@pytest.mark.parametrize("hw_loop", [True, False])
+def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<8 }",
+        """
+        for i
+            <>t = i
+            ... gbarrier
+            out[i] = t
+        end
+        """, seq_dependencies=True)
+
+    if hw_loop:
+        knl = lp.tag_inames(knl, dict(i="g.0"))
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+
+
+def test_save_of_private_array(ctx_factory, debug=False):
     ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    pytest.xfail("spilling doesn't yet use local axes")
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<8 }",
+        """
+        for i
+            <>t[i] = i
+            ... gbarrier
+            out[i] = t[i]
+        end
+        """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t", "private")
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+
+
+def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-            "{ [i,k]: 0<=i<n and 0<=k<3 }",
-            """
-            for i, k
-                ... gbarrier
-                <> t_private_scalar = a[k,i+1]
-                <> t_private_array[i % 2] = a[k,i+1]
-                c[k,i] = a[k,i+1]
-                ... gbarrier
-                out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
+        "{ [i,j,k]: 0<=i,j,k<8 }",
+        """
+        for i
+            for j
+               <>t[j] = j
             end
-            """, seq_dependencies=True)
+            ... gbarrier
+            for k
+                out[i,k] = t[k]
+            end
+        end
+        """, seq_dependencies=True)
 
-    knl = lp.add_and_infer_dtypes(knl,
-            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
-    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
-    knl = lp.set_temporary_scope(knl, "t_private_array", "private")
+    knl = lp.tag_inames(knl, dict(i="g.0"))
+    knl = lp.set_temporary_scope(knl, "t", "private")
 
-    ref_knl = knl
+    save_and_reload_temporaries_test(
+        queue, knl, np.vstack((8 * (np.arange(8),))), debug)
 
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
+def test_save_of_private_multidim_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
+    knl = lp.make_kernel(
+        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
+        """
+        for i
+            for j, k
+               <>t[j,k] = k
+            end
+            ... gbarrier
+            for l, m
+                out[i,l,m] = t[l,m]
+            end
+        end
+        """, seq_dependencies=True)
 
-    # map schedule onto host or device
-    print(knl)
+    knl = lp.set_temporary_scope(knl, "t", "private")
 
-    cgr = lp.generate_code_v2(knl)
+    result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
-    assert len(cgr.device_programs) == 2
 
-    print(cgr.device_code())
-    print(cgr.host_code())
+def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    knl = lp.make_kernel(
+        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
+        """
+        for i
+            for j, k
+               <>t[j,k] = k
+            end
+            ... gbarrier
+            for l, m
+                out[i,l,m] = t[l,m]
+            end
+        end
+        """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t", "private")
+    knl = lp.tag_inames(knl, dict(i="g.0"))
+
+    result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
 
-def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
+@pytest.mark.parametrize("hw_loop", [True, False])
+def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False):
     ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-            "{ [i,k]: 0<=i<n and 0<=k<3 }",
+            "{ [i,j,k]: 0<=i,j,k<10 }",
             """
-            for i, k
-                ... gbarrier
-                <> t_local[i % 8,k] = i % 8
-                c[k,i] = a[k,i+1]
+            for i
+                for k
+                    <> t_arr[k] = k
+                end
+                <> t_scalar = 1
+                for j
+                    ... gbarrier
+                    out[j] = t_scalar
+                    ... gbarrier
+                    t_scalar = 10
+                end
                 ... gbarrier
-                out[k,i] = c[k,i] + t_local[i % 8,k]
+                <> flag = i == 9
+                out[i] = t_arr[i] {if=flag}
             end
             """, seq_dependencies=True)
 
-    knl = lp.add_and_infer_dtypes(knl,
-            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+    knl = lp.set_temporary_scope(knl, "t_arr", "private")
+    if hw_loop:
+        knl = lp.tag_inames(knl, dict(i="g.0"))
 
-    knl = lp.set_temporary_scope(knl, "t_local", "local")
+    result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9])
 
-    ref_knl = knl
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
-    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
+def test_save_of_local_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
+    knl = lp.make_kernel(
+        "{ [i,j]: 0<=i,j<8 }",
+        """
+        for i, j
+            <>t[2*j] = j
+            t[2*j+1] = j
+            ... gbarrier
+            out[i] = t[2*i]
+        end
+        """, seq_dependencies=True)
 
-    # map schedule onto host or device
-    print(knl)
+    knl = lp.set_temporary_scope(knl, "t", "local")
+    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
 
-    cgr = lp.generate_code_v2(knl)
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
 
-    assert len(cgr.device_programs) == 2
 
-    print(cgr.device_code())
-    print(cgr.host_code())
+def test_save_local_multidim_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}",
+            """
+            for i, j, k
+                ... gbarrier
+                <> t_local[k,j] = 1
+                ... gbarrier
+                out[k,i*2+j] = t_local[k,j]
+            end
+            """, seq_dependencies=True)
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8))
+    knl = lp.set_temporary_scope(knl, "t_local", "local")
+    knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
+
+    save_and_reload_temporaries_test(queue, knl, 1, debug)
 
 
 def test_global_temporary(ctx_factory):
@@ -1609,6 +1729,100 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order):
     assert np.array_equal(a, a2)
 
 
+def test_header_extract():
+    knl = lp.make_kernel('{[k]: 0<=k<n}',
+         """
+         for k
+             T[k] = k**2
+         end
+         """,
+         [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
+         '...'])
+
+    knl = lp.fix_parameters(knl, n=200)
+
+    #test C
+    cknl = knl.copy(target=lp.CTarget())
+    assert str(lp.generate_header(cknl)[0]) == (
+            'void loopy_kernel(float *__restrict__ T);')
+
+    #test CUDA
+    cuknl = knl.copy(target=lp.CudaTarget())
+    assert str(lp.generate_header(cuknl)[0]) == (
+            'extern "C" __global__ void __launch_bounds__(1) '
+            'loopy_kernel(float *__restrict__ T);')
+
+    #test OpenCL
+    oclknl = knl.copy(target=lp.PyOpenCLTarget())
+    assert str(lp.generate_header(oclknl)[0]) == (
+            '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
+            'loopy_kernel(__global float *__restrict__ T);')
+
+
+def test_scalars_with_base_storage(ctx_factory):
+    """ Regression test for !50 """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<1}",
+            "a = 1",
+            [lp.TemporaryVariable("a", dtype=np.float64,
+                                  shape=(), base_storage="base")])
+
+    knl(queue, out_host=True)
+
+
+def test_tight_loop_bounds(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        ["{ [i] : 0 <= i <= 5 }",
+         "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"],
+        """
+        for i
+          for j
+            out[j] = j
+          end
+        end
+        """,
+        silenced_warnings="write_race(insn)")
+
+    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
+
+    evt, (out,) = knl(queue, out_host=True)
+
+    assert (out == np.arange(10)).all()
+
+
+def test_tight_loop_bounds_codegen():
+    knl = lp.make_kernel(
+        ["{ [i] : 0 <= i <= 5 }",
+         "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
+        """
+        for i
+          for j
+            out[j] = j
+          end
+        end
+        """,
+        silenced_warnings="write_race(insn)",
+        target=lp.OpenCLTarget())
+
+    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
+
+    cgr = lp.generate_code_v2(knl)
+    #print(cgr.device_code())
+
+    for_loop = \
+        "for (int j = " \
+        "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
+        "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)"
+
+    assert for_loop in cgr.device_code()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 0c304b7a854579007f57ba204cbff8f440aaf5fc..c85aa80ec92eb0185d30f96b478ae37043c0d7e0 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
 
     if 1:
         print("OPS")
-        op_poly = lp.get_op_poly(hsv)
-        print(lp.stringify_stats_mapping(op_poly))
+        op_map = lp.get_op_map(hsv)
+        print(lp.stringify_stats_mapping(op_map))
 
         print("MEM")
-        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
-        print(lp.stringify_stats_mapping(gmem_poly))
+        gmem_map = lp.get_mem_access_map(hsv).to_bytes()
+        print(lp.stringify_stats_mapping(gmem_map))
 
     hsv = lp.set_options(hsv, cl_build_options=[
          "-cl-denorms-are-zero",
diff --git a/test/test_reduction.py b/test/test_reduction.py
index b78509b6318a984d117d00b1a6854d9611db80d1..820c669da494f4d8863d274120cd5c0c7eb4420f 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -214,23 +214,18 @@ def test_local_parallel_reduction(ctx_factory, size):
         lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
-@pytest.mark.parametrize("size", [10000])
+@pytest.mark.parametrize("size", [1000])
 def test_global_parallel_reduction(ctx_factory, size):
-    # ctx = ctx_factory()
-    # queue = cl.CommandQueue(ctx)
+    ctx = ctx_factory()
 
     knl = lp.make_kernel(
             "{[i]: 0 <= i < n }",
             """
-            for i
-                <> key = make_uint2(i, 324830944)  {inames=i}
-                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
-                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
-            end
-            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
+            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
+            z[0] = sum(i, i/13)
             """)
 
-    # ref_knl = knl
+    ref_knl = knl
 
     gsize = 128
     knl = lp.split_iname(knl, "i", gsize * 20)
@@ -242,42 +237,52 @@ def test_global_parallel_reduction(ctx_factory, size):
     knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL)
     knl = lp.realize_reduction(knl)
+    knl = lp.add_dependency(
+            knl, "writes:acc_i_outer",
+            "id:red_i_outer_arg_barrier")
 
-    #evt, (z,) = knl(queue, n=size)
-
-    #lp.auto_test_vs_ref(ref_knl, ctx, knl)
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl, parameters={"n": size},
+            print_ref_code=True)
 
 
-@pytest.mark.parametrize("size", [10000])
-def test_global_parallel_reduction_simpler(ctx_factory, size):
+@pytest.mark.parametrize("size", [1000])
+def test_global_mc_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
-    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")
+    import pyopencl.version  # noqa
+    if cl.version.VERSION < (2016, 2):
+        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")
 
     knl = lp.make_kernel(
-            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
+            "{[i]: 0 <= i < n }",
             """
-            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
-            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
-            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
-
-            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)
-
-            result = sum(j, tmp[j])
+            for i
+                <> key = make_uint2(i, 324830944)  {inames=i}
+                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
+                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
+            end
+            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
             """)
 
-    ng = 50
-    knl = lp.fix_parameters(knl, ng=ng)
-
-    knl = lp.set_options(knl, write_cl=True)
-
     ref_knl = knl
 
-    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
-    knl = lp.split_reduction_outward(knl, "l_inner")
-    knl = lp.tag_inames(knl, "g:g.0,j:l.0")
+    gsize = 128
+    knl = lp.split_iname(knl, "i", gsize * 20)
+    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
+    knl = lp.split_reduction_inward(knl, "i_inner_inner")
+    knl = lp.split_reduction_inward(knl, "i_inner_outer")
+    from loopy.transform.data import reduction_arg_to_subst_rule
+    knl = reduction_arg_to_subst_rule(knl, "i_outer")
+    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
+            temporary_scope=lp.temp_var_scope.GLOBAL)
+    knl = lp.realize_reduction(knl)
+    knl = lp.add_dependency(
+            knl, "writes:acc_i_outer",
+            "id:red_i_outer_arg_barrier")
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl, parameters={"n": size})
 
 
 def test_argmax(ctx_factory):
@@ -388,112 +393,6 @@ def test_double_sum_made_unique(ctx_factory):
     assert b.get() == ref
 
 
-def test_fd_demo():
-    knl = lp.make_kernel(
-        "{[i,j]: 0<=i,j<n}",
-        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
-                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
-                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
-    #assumptions="n mod 16=0")
-    knl = lp.split_iname(knl,
-            "i", 16, outer_tag="g.1", inner_tag="l.1")
-    knl = lp.split_iname(knl,
-            "j", 16, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.add_prefetch(knl, "u",
-            ["i_inner", "j_inner"],
-            fetch_bounding_box=True)
-
-    #n = 1000
-    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
-
-    knl = lp.set_options(knl, write_cl=True)
-    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
-    code, inf = lp.generate_code(knl)
-    print(code)
-
-    assert "double" not in code
-
-
-def test_fd_1d(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i]: 0<=i<n}",
-        "result[i] = u[i+1]-u[i]")
-
-    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
-    ref_knl = knl
-
-    knl = lp.split_iname(knl, "i", 16)
-    knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j")
-    knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for")
-    knl = lp.assume(knl, "n mod 16 = 0")
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=2048))
-
-
-def test_poisson_fem(ctx_factory):
-    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
-    ctx = ctx_factory()
-
-    nbf = 5
-    nqp = 5
-    sdim = 3
-
-    knl = lp.make_kernel(
-            "{ [c,i,j,k,ell,ell2,ell3]: \
-            0 <= c < nels and \
-            0 <= i < nbf and \
-            0 <= j < nbf and \
-            0 <= k < nqp and \
-            0 <= ell,ell2 < sdim}",
-            """
-            dpsi(bf,k0,dir) := \
-                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
-            Ael[c,i,j] = \
-                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
-            """,
-            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
-
-    print(knl)
-
-    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
-
-    ref_knl = knl
-
-    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])
-
-    def variant_1(knl):
-        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
-        knl = lp.prioritize_loops(knl, "c,i,j")
-        return knl
-
-    def variant_2(knl):
-        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
-        knl = lp.prioritize_loops(knl, "c,i,j")
-        return knl
-
-    def add_types(knl):
-        return lp.add_and_infer_dtypes(knl, dict(
-            w=np.float32,
-            J=np.float32,
-            DPsi=np.float32,
-            DFinv=np.float32,
-            ))
-
-    for variant in [
-            #variant_1,
-            variant_2
-            ]:
-        knl = variant(knl)
-
-        lp.auto_test_vs_ref(
-                add_types(ref_knl), ctx, add_types(knl),
-                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
-
-
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 68be5b8a260858e058619c796b3836611c8d4f0f..fb502045c7b6b2c7e02d11ad3ebda3b5d13c8bda 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,8 +28,10 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
+from loopy.types import to_loopy_type
 import numpy as np
 
+from pymbolic.primitives import Variable
 
 def test_op_counter_basic():
 
@@ -44,21 +46,22 @@ def test_op_counter_basic():
             name="basic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
-
+
 
 def test_op_counter_reduction():
 
@@ -70,15 +73,19 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    assert f32 == f32add + f32mul
+
 
 def test_op_counter_logic():
 
@@ -92,15 +99,15 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -120,24 +127,25 @@ def test_op_counter_specialops():
             name="specialops", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
-    assert f64pow == i32add == f64rsqrt == f64sin == n*m
+    assert f64pow == i32add == f64rsq == f64sin == n*m
 
 
 def test_op_counter_bitwise():
@@ -157,17 +165,17 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
-    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -196,9 +204,9 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    mul_count = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
-    flops = poly.eval_with_dict(value_dict)
+    flops = mul_count.eval_with_dict(value_dict)
 
     if expect_fallback:
         assert flops == 144
@@ -206,7 +214,7 @@ def test_op_counter_triangular_domain():
         assert flops == 78
 
 
-def test_gmem_access_counter_basic():
+def test_mem_access_counter_basic():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -220,31 +228,37 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                   ].eval_with_dict(params)
-    assert f32 == 3*n*m*l
-    assert f64 == 2*n*m
-
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                   ].eval_with_dict(params)
-    assert f32 == n*m*l
-    assert f64 == n*m
-
-
-def test_gmem_access_counter_reduction():
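+    # counts are now keyed on (mtype, dtype, stride, direction, variable),
+    # so the per-variable entries are summed by hand below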
+    f32l = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32l += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64l = mem_map[lp.MemAccess('global', np.float64,
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64l += mem_map[lp.MemAccess('global', np.float64,
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
+    assert f32l == 3*n*m*l
+    assert f64l == 2*n*m
+
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
+    assert f32s == n*m*l
+    assert f64s == n*m
+
+
+def test_mem_access_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -254,23 +268,33 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    assert f32 == 2*n*m*l
+    f32l = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32l += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    assert f32l == 2*n*m*l
+
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    assert f32s == n*l
 
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    assert f32 == n*l
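+    # to_bytes() weights each access count by its dtype's size in bytes,
+    # so the float32 traffic below scales by a factor of 4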
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*f32l
+    assert st_bytes == 4*f32s
 
 
-def test_gmem_access_counter_logic():
+def test_mem_access_counter_logic():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -282,27 +306,29 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    assert f32 == 2*n*m
-    assert f64 == n*m
 
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    assert f64 == n*m
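+    # group_by() sums over the omitted fields (stride and variable here),
+    # leaving one entry per (mtype, dtype, direction) combination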
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='store')
+                         ].eval_with_dict(params)
+    assert f32_g_l == 2*n*m
+    assert f64_g_l == n*m
+    assert f64_g_s == n*m
 
 
-def test_gmem_access_counter_specialops():
+def test_mem_access_counter_specialops():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -314,33 +340,43 @@ def test_gmem_access_counter_specialops():
             ],
             name="specialops", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
+    f32 = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32 += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
+    f32 = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64 = mem_map[lp.MemAccess('global', np.float64,
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
+    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'])
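+    # filter_by() keeps only the entries matching the given field values;
+    # eval_and_sum() then evaluates each survivor and totals the results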
+    tot = filtered_map.eval_and_sum(params)
+    assert tot == n*m*l + n*m
+
 
-def test_gmem_access_counter_bitwise():
+def test_mem_access_counter_bitwise():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -357,23 +393,35 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'load')
-                    ].eval_with_dict(params)
+    i32 = mem_map[lp.MemAccess('global', np.int32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32,
+                          stride=0, direction='load', variable='g')
+               ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'store')
-                    ].eval_with_dict(params)
+    i32 = mem_map[lp.MemAccess('global', np.int32,
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32,
+                          stride=0, direction='store', variable='e')
+               ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
 
-def test_gmem_access_counter_mixed():
+def test_mem_access_counter_mixed():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -391,35 +439,44 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f32uniform = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
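+    # a and b are accessed with a symbolic stride of m along the parallel
+    # axis, while x, g, and h are uniform (stride 0)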
+    f64uniform = mem_map[lp.MemAccess('global', np.float64,
+                                stride=0, direction='load', variable='g')
+                     ].eval_with_dict(params)
+    f64uniform += mem_map[lp.MemAccess('global', np.float64,
+                                 stride=0, direction='load', variable='h')
+                      ].eval_with_dict(params)
+    f32uniform = mem_map[lp.MemAccess('global', np.float32,
+                                stride=0, direction='load', variable='x')
+                     ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                                  stride=Variable('m'), direction='load',
+                                  variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                                   stride=Variable('m'), direction='load',
+                                   variable='b')
+                        ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
+    f64uniform = mem_map[lp.MemAccess('global', np.float64,
+                                stride=0, direction='store', variable='e')
+                     ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
+                                  stride=Variable('m'), direction='store',
+                                  variable='c')
+                       ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_nonconsec():
+def test_mem_access_counter_nonconsec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -435,31 +492,43 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
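+    # strides may be symbolic expressions: m for g and h, m*l for a and b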
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
+                                  stride=Variable('m'), direction='load',
+                                  variable='g')
+                       ].eval_with_dict(params)
+    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
+                                   stride=Variable('m'), direction='load',
+                                   variable='h')
+                        ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='load', variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                                   stride=Variable('m')*Variable('l'),
+                                   direction='load', variable='b')
+                        ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
+                                  stride=Variable('m'), direction='store',
+                                  variable='e')
+                       ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='store', variable='c')
+                       ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_consec():
+def test_mem_access_counter_consec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -474,27 +543,36 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'load')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'load')
-                    ].eval_with_dict(params)
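+    # with k tagged l.0, the accesses below are all consecutive (stride 1)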
+    f64consec = mem_map[lp.MemAccess('global', np.float64,
+                        stride=1, direction='load', variable='g')
+                     ].eval_with_dict(params)
+    f64consec += mem_map[lp.MemAccess('global', np.float64,
+                        stride=1, direction='load', variable='h')
+                     ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='load', variable='a')
+                     ].eval_with_dict(params)
+    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                        stride=1, direction='load', variable='b')
+                     ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'store')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'store')
-                    ].eval_with_dict(params)
+    f64consec = mem_map[lp.MemAccess('global', np.float64,
+                        stride=1, direction='store', variable='e')
+                     ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='store', variable='c')
+                     ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
 
@@ -511,15 +589,15 @@ def test_barrier_counter_nobarriers():
             ],
             name="basic", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    sync_poly = lp.get_synchronization_poly(knl)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
+    sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    assert len(sync_map) == 1
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
 
 
 def test_barrier_counter_barriers():
@@ -539,13 +617,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
-    poly = lp.get_synchronization_poly(knl)
-    print(poly)
+    sync_map = lp.get_synchronization_map(knl)
+    print(sync_map)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly["barrier_local"].eval_with_dict(params)
+    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
 
@@ -560,50 +638,58 @@ def test_all_counters_parallel_matmul():
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    sync_poly = lp.get_synchronization_poly(knl)
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    sync_map = lp.get_synchronization_map(knl)
+    assert len(sync_map) == 2
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
 
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     f32mul = op_map[
-                        (np.dtype(np.float32), 'mul')
+                        lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        (np.dtype(np.float32), 'add')
+                        lp.Op(np.float32, 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        (np.dtype(np.int32), 'add')
+                        lp.Op(np.int32, 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        (np.dtype(np.int32), 'mul')
+                        lp.Op(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
-    assert i32ops == n*m*l*4 + l*n*4
 
-    subscript_map = lp.get_gmem_access_poly(knl)
-    f32uncoal = subscript_map[
-                        (np.dtype(np.float32), 'nonconsecutive', 'load')
-                        ].eval_with_dict(params)
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'load')
-                        ].eval_with_dict(params)
+    mem_map = lp.get_mem_access_map(knl)
 
-    assert f32uncoal == n*m*l
-    assert f32coal == n*m*l
+    f32coal = mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='load', variable='b')
+                            ].eval_with_dict(params)
+    f32coal += mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='load', variable='a')
+                            ].eval_with_dict(params)
 
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'store')
-                        ].eval_with_dict(params)
+    assert f32coal == n*m+m*l
+
+    f32coal = mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='store', variable='c')
+                            ].eval_with_dict(params)
 
     assert f32coal == n*l
 
+    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
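+    # the prefetched copies of a and b live in local memory, so their loads
+    # show up under mtype 'local'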
+    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
+                                            direction='load')
+                                 ].eval_with_dict(params)
+    assert local_mem_l == n*m*l*2
 
 def test_gather_access_footprint():
     knl = lp.make_kernel(
@@ -637,6 +723,82 @@ def test_gather_access_footprint_2():
         print(key, count(knl, footprint))
 
 
+def test_summations_and_filters():
+
+    knl = lp.make_kernel(
+            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            [
+                """
+                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+                e[i, k+1] = -g[i,k]*h[i,k+1]
+                """
+            ],
+            name="basic", assumptions="n,m,l >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    n = 512
+    m = 256
+    l = 128
+    params = {'n': n, 'm': m, 'l': l}
+
+    mem_map = lp.get_mem_access_map(knl)
+
+    loads_a = mem_map.filter_by(direction=['load'], variable=['a']
+                            ).eval_and_sum(params)
+    assert loads_a == 2*n*m*l
+
+    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']
+                                  ).eval_and_sum(params)
+    assert global_stores == n*m*l + n*m
+
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
+    assert st_bytes == 4*n*m*l + 8*n*m
+
+    # ignore stride and variable names in this map
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
+                         ].eval_with_dict(params)
+    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
+                         ].eval_with_dict(params)
+    assert f32lall == 3*n*m*l
+    assert f64lall == 2*n*m
+
+    op_map = lp.get_op_map(knl)
+
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
+    assert f32 == n*m*l*3
+    assert f64 == n*m
+    assert i32 == n*m*2
+
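+    # filter fields accept lists; an entry is kept if it matches any value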
+    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
+    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
+    assert addsub_all == n*m*l + n*m*2
+    assert f32ops_all == n*m*l*3
+
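+    # filtering on a field that Op does not have matches nothing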
+    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
+    assert non_field == 0
+
+    ops_nodtype = op_map.group_by('name')
+    ops_noname = op_map.group_by('dtype')
+    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
+    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    assert mul_all == n*m*l + n*m
+    assert f64ops_all == n*m
+
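+    # filter_by_func() applies an arbitrary predicate to each key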
+    def func_filter(key):
+        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
+               key.direction == 'load'
+    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
+    assert s1f64l == 2*n*m
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])