diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5e2805bd9cc8fbb64f63893093e45229d6803cb3..5f7d5a6463715da6564f20827865ae48944784e1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,8 +1,8 @@
 Python 2.7 AMD CPU:
   script:
-  - py_version=2.7
-  - cl_dev=amd_pu
-  - EXTRA_INSTALL="numpy mako"
+  - export PY_EXE=python2.7
+  - export PYOPENCL_TEST=amd:pu
+  - export EXTRA_INSTALL="numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -12,9 +12,10 @@ Python 2.7 AMD CPU:
   - tags
 Python 3.4 AMD CPU:
   script:
-  - py_version=3.4
-  - cl_dev=amd_pu
-  - EXTRA_INSTALL="numpy mako"
+  - export PY_EXE=python3.4
+  - export PYOPENCL_TEST=amd:pu
+  - export EXTRA_INSTALL="numpy mako"
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
diff --git a/contrib/floopy-highlighting/floopy.vim b/contrib/floopy-highlighting/floopy.vim
index 59c5b15a431e3c28072c96afd824b6937f973b56..57c09a652c0cb9141d6764d300ebb3618577b05d 100644
--- a/contrib/floopy-highlighting/floopy.vim
+++ b/contrib/floopy-highlighting/floopy.vim
@@ -7,7 +7,7 @@
 " :set filetype=floopy
 "
 " You may also include a line
-" vim: filetype=pyopencl.python
+" vim: filetype=floopy.python
 " at the end of your file to set the file type automatically.
 "
 " Another option is to include the following in your .vimrc
@@ -16,24 +16,7 @@
 runtime! syntax/fortran.vim
 
 unlet b:current_syntax
-try
-  syntax include @clCode syntax/opencl.vim
-catch
-  syntax include @clCode syntax/c.vim
-endtry
-
-if exists('b:current_syntax')
-  let s:current_syntax=b:current_syntax
-  " Remove current syntax definition, as some syntax files (e.g. cpp.vim)
-  " do nothing if b:current_syntax is defined.
-  unlet b:current_syntax
-endif
-
 syntax include @LoopyPython syntax/python.vim
-try
-  syntax include @LoopyPython after/syntax/python.vim
-catch
-endtry
 
 if exists('s:current_syntax')
   let b:current_syntax=s:current_syntax
@@ -43,6 +26,6 @@ endif
 
 syntax region textSnipLoopyPython
 \ matchgroup=Comment
-\ start='$loopy begin transform' end='$loopy end transform'
+\ start='$loopy begin' end='$loopy end'
 \ containedin=ALL
 \ contains=@LoopyPython
diff --git a/doc/reference.rst b/doc/reference.rst
index 6ed83d43084b5578c817964b13097b4aa78294ef..c7435bbf7f84570b6672ba1bf42eeab9d495ea56 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -356,7 +356,7 @@ TODO: Matching instruction tags
 
 .. automodule:: loopy.context_matching
 
-.. autofunction:: parse_id_match
+.. autofunction:: parse_match
 
 .. autofunction:: parse_stack_match
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index e566365f8d520d33f518bc95cbaa43e5af6e747c..3180b5066823481c01b43f659ec851e3951fa5e4 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -96,7 +96,7 @@ always see loopy's view of a kernel by printing it.
 
 .. doctest::
 
-    >>> print knl
+    >>> print(knl)
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
@@ -248,7 +248,7 @@ call :func:`loopy.generate_code`:
     >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0])
     >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl)
     >>> code, _ = lp.generate_code(typed_knl)
-    >>> print code
+    >>> print(code)
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
@@ -316,7 +316,7 @@ that these dependencies show up there, too:
 
 .. doctest::
 
-    >>> print knl
+    >>> print(knl)
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
@@ -444,8 +444,7 @@ around the *j* loop, or the other way around, in the following simple
 zero-fill kernel?
 
 It turns out that Loopy will typically choose a loop nesting for us, but it
-does not like doing so. In this tutorial (where we've turned Loopy's warnings
-into errors), we are told what is wrong in no uncertain terms::
+does not like doing so. Loopy will react to the following code
 
 .. doctest::
 
@@ -455,14 +454,13 @@ into errors), we are told what is wrong in no uncertain terms::
     ...     a[i,j] = 0
     ...     """)
 
+By saying::
 
-    >>> knl = lp.set_options(knl, "write_cl")
-    >>> evt, (out,) = knl(queue, a=a_mat_dev)
-    Traceback (most recent call last):
-    ...
     LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
 
-This is easily resolved:
+And by picking one of the possible loop orderings at random.
+
+The warning (and the nondeterminism it warns about) is easily resolved:
 
 .. doctest::
 
@@ -476,6 +474,7 @@ ambiguous.
 
 .. doctest::
 
+    >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
@@ -673,7 +672,7 @@ Iname implementation tags are also printed along with the entire kernel:
 
 .. doctest::
 
-    >>> print knl
+    >>> print(knl)
     ---------------------------------------------------------------------------
     ...
     INAME IMPLEMENTATION TAGS:
@@ -723,9 +722,9 @@ those for us:
 .. doctest::
 
     >>> glob, loc = knl.get_grid_sizes()
-    >>> print glob
+    >>> print(glob)
     (Aff("[n] -> { [(floor((127 + n)/128))] }"),)
-    >>> print loc
+    >>> print(loc)
     (Aff("[n] -> { [(128)] }"),)
 
 Note that this functionality returns internal objects and is not really
@@ -850,8 +849,8 @@ variable, as one might do in C or another programming language:
     ...     "{ [i]: 0<=i<n }",
     ...     """
     ...     <float32> a_temp = sin(a[i])
-    ...     out1[i] = a_temp
-    ...     out2[i] = sqrt(1-a_temp*a_temp)
+    ...     out1[i] = a_temp {id=out1}
+    ...     out2[i] = sqrt(1-a_temp*a_temp) {dep=out1}
     ...     """)
 
 The angle brackets ``<>`` denote the creation of a temporary. The name of
@@ -862,6 +861,9 @@ the conventional :mod:`numpy` scalar types (:class:`numpy.int16`,
 :class:`numpy.complex128`) will work. (Yes, :mod:`loopy` supports and
 generates correct code for complex arithmetic.)
 
+(If you're wondering, the dependencies above were added to make the doctest
+produce predictable output.)
+
 The generated code places this variable into what OpenCL calls 'private'
 memory, local to each work item.
 
@@ -877,8 +879,8 @@ memory, local to each work item.
       for (int i = 0; i <= -1 + n; ++i)
       {
         a_temp = sin(a[i]);
-        out2[i] = sqrt(1.0f + -1.0f * a_temp * a_temp);
         out1[i] = a_temp;
+        out2[i] = sqrt(1.0f + -1.0f * a_temp * a_temp);
       }
     }
 
@@ -921,7 +923,7 @@ Consider the following example:
     ...     "{ [i_outer,i_inner, k]:  "
     ...          "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
     ...     """
-    ...     <> a_temp[i_inner] = a[16*i_outer + i_inner]
+    ...     <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
     ...     out[16*i_outer + i_inner] = sum(k, a_temp[k])
     ...     """)
     >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
@@ -952,6 +954,9 @@ it is written in parallel across values of the group-local iname
 *i_inner*. In addition, :mod:`loopy` has emitted a barrier instruction to
 achieve the :ref:`ordering` specified by the instruction dependencies.
 
+(The ``priority=10`` attribute was added to make the output of the test
+deterministic.)
+
 .. note::
 
     It is worth noting that it was not necessary to provide a size for the
diff --git a/loopy/__init__.py b/loopy/__init__.py
index dcd8e27c1eaf02a0f7aee133598b0e2fd34f70b9..58029467c7bb263203bf33430eb306fe6f16bf14 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -152,7 +152,10 @@ class _InameSplitter(RuleAwareIdentityMapper):
     def map_reduction(self, expr, expn_state):
         if (self.split_iname in expr.inames
                 and self.split_iname not in expn_state.arg_context
-                and self.within(expn_state.stack)):
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
             new_inames = list(expr.inames)
             new_inames.remove(self.split_iname)
             new_inames.extend([self.outer_iname, self.inner_iname])
@@ -166,7 +169,10 @@ class _InameSplitter(RuleAwareIdentityMapper):
     def map_variable(self, expr, expn_state):
         if (expr.name == self.split_iname
                 and self.split_iname not in expn_state.arg_context
-                and self.within(expn_state.stack)):
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
             return self.replacement_index
         else:
             return super(_InameSplitter, self).map_variable(expr, expn_state)
@@ -318,7 +324,10 @@ class _InameJoiner(RuleAwareSubstitutionMapper):
         expr_inames = set(expr.inames)
         overlap = (self.join_inames & expr_inames
                 - set(expn_state.arg_context))
-        if overlap and self.within(expn_state.stack):
+        if overlap and self.within(
+                expn_state.kernel,
+                expn_state.instruction,
+                expn_state.stack):
             if overlap != expr_inames:
                 raise LoopyError(
                         "Cannot join inames '%s' if there is a reduction "
@@ -520,7 +529,10 @@ class _InameDuplicator(RuleAwareIdentityMapper):
 
     def map_reduction(self, expr, expn_state):
         if (set(expr.inames) & self.old_inames_set
-                and self.within(expn_state.stack)):
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
             new_inames = tuple(
                     self.old_to_new.get(iname, iname)
                     if iname not in expn_state.arg_context
@@ -538,14 +550,17 @@ class _InameDuplicator(RuleAwareIdentityMapper):
 
         if (new_name is None
                 or expr.name in expn_state.arg_context
-                or not self.within(expn_state.stack)):
+                or not self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
             return super(_InameDuplicator, self).map_variable(expr, expn_state)
         else:
             from pymbolic import var
             return var(new_name)
 
-    def map_instruction(self, insn):
-        if not self.within(((insn.id, insn.tags),)):
+    def map_instruction(self, kernel, insn):
+        if not self.within(kernel, insn, ()):
             return insn
 
         new_fid = frozenset(
@@ -1050,7 +1065,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # If the rule survived past precompute() (i.e. some accesses fell outside
     # the footprint), get rid of it before moving on.
     if rule_name in new_kernel.substitutions:
-        return expand_subst(new_kernel, rule_name)
+        return expand_subst(new_kernel, "id:"+rule_name)
     else:
         return new_kernel
 
@@ -1060,19 +1075,19 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
 # {{{ instruction processing
 
 def find_instructions(kernel, insn_match):
-    from loopy.context_matching import parse_id_match
-    match = parse_id_match(insn_match)
-    return [insn for insn in kernel.instructions if match(insn.id, insn.tags)]
+    from loopy.context_matching import parse_match
+    match = parse_match(insn_match)
+    return [insn for insn in kernel.instructions if match(kernel, insn)]
 
 
 def map_instructions(kernel, insn_match, f):
-    from loopy.context_matching import parse_id_match
-    match = parse_id_match(insn_match)
+    from loopy.context_matching import parse_match
+    match = parse_match(insn_match)
 
     new_insns = []
 
     for insn in kernel.instructions:
-        if match(insn.id, None):
+        if match(kernel, insn):
             new_insns.append(f(insn))
         else:
             new_insns.append(insn)
@@ -1084,7 +1099,7 @@ def set_instruction_priority(kernel, insn_match, priority):
     """Set the priority of instructions matching *insn_match* to *priority*.
 
     *insn_match* may be any instruction id match understood by
-    :func:`loopy.context_matching.parse_id_match`.
+    :func:`loopy.context_matching.parse_match`.
     """
 
     def set_prio(insn):
@@ -1098,7 +1113,7 @@ def add_dependency(kernel, insn_match, dependency):
     by *insn_match*.
 
     *insn_match* may be any instruction id match understood by
-    :func:`loopy.context_matching.parse_id_match`.
+    :func:`loopy.context_matching.parse_match`.
     """
 
     def add_dep(insn):
@@ -1220,7 +1235,11 @@ class _ReductionSplitter(RuleAwareIdentityMapper):
             # FIXME
             raise NotImplementedError()
 
-        if self.inames <= set(expr.inames) and self.within(expn_state.stack):
+        if (self.inames <= set(expr.inames)
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)):
             leftover_inames = set(expr.inames) - self.inames
 
             from loopy.symbolic import Reduction
@@ -1659,10 +1678,12 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
     var_name_gen = kernel.get_var_name_generator()
 
     from pymbolic.mapper.substitutor import make_subst_func
+    from loopy.context_matching import parse_stack_match
+
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, var_name_gen)
     old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context,
-            make_subst_func(subst_dict), within=lambda stack: True)
+            make_subst_func(subst_dict), within=parse_stack_match(None))
 
     kernel = (
             rule_mapping_context.finish_kernel(
@@ -1792,12 +1813,12 @@ def fold_constants(kernel):
 # {{{ tag_instructions
 
 def tag_instructions(kernel, new_tag, within=None):
-    from loopy.context_matching import parse_stack_match
-    within = parse_stack_match(within)
+    from loopy.context_matching import parse_match
+    within = parse_match(within)
 
     new_insns = []
     for insn in kernel.instructions:
-        if within(((insn.id, insn.tags),)):
+        if within(kernel, insn):
             new_insns.append(
                     insn.copy(tags=insn.tags + (new_tag,)))
         else:
diff --git a/loopy/buffer.py b/loopy/buffer.py
index d155dba7e852e6a3978c3c92d4f4a7cba29bc4f0..fdc3774b29f64ba5ae8c465076f48b805836d40b 100644
--- a/loopy/buffer.py
+++ b/loopy/buffer.py
@@ -51,7 +51,10 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
 
     def map_variable(self, expr, expn_state):
         result = None
-        if expr.name == self.var_name and self.within(expn_state):
+        if expr.name == self.var_name and self.within(
+                expn_state.kernel,
+                expn_state.instruction,
+                expn_state.stack):
             result = self.map_array_access((), expn_state)
 
         if result is None:
@@ -62,7 +65,10 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
 
     def map_subscript(self, expr, expn_state):
         result = None
-        if expr.aggregate.name == self.var_name and self.within(expn_state):
+        if expr.aggregate.name == self.var_name and self.within(
+                expn_state.kernel,
+                expn_state.instruction,
+                expn_state.stack):
             result = self.map_array_access(expr.index, expn_state)
 
         if result is None:
@@ -172,7 +178,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
 
     access_descriptors = []
     for insn in kernel.instructions:
-        if not within((insn.id, insn.tags)):
+        if not within(kernel, insn, ()):
             continue
 
         for assignee, index in insn.assignees_and_indices():
diff --git a/loopy/context_matching.py b/loopy/context_matching.py
index b259a0ddd33e20d98bf628db0c993501908bcb36..cc375c43897c0ad5e138d4f020f4e48f9a463451 100644
--- a/loopy/context_matching.py
+++ b/loopy/context_matching.py
@@ -1,9 +1,7 @@
 """Matching functionality for instruction ids and subsitution
 rule invocations stacks."""
 
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -27,150 +25,328 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from six.moves import range, intern
+
+
 NoneType = type(None)
 
+from pytools.lex import RE
+
+
+def re_from_glob(s):
+    import re
+    from fnmatch import translate
+    return re.compile("^"+translate(s.strip())+"$")
+
+# {{{ parsing
 
-# {{{ id match objects
+# {{{ lexer data
+
+_and = intern("and")
+_or = intern("or")
+_not = intern("not")
+_openpar = intern("openpar")
+_closepar = intern("closepar")
+
+_id = intern("_id")
+_tag = intern("_tag")
+_writes = intern("_writes")
+_reads = intern("_reads")
+_iname = intern("_iname")
+
+_whitespace = intern("_whitespace")
+
+# }}}
 
-class AllMatch(object):
-    def __call__(self, identifier, tag):
+
+_LEX_TABLE = [
+    (_and, RE(r"and\b")),
+    (_or, RE(r"or\b")),
+    (_not, RE(r"not\b")),
+    (_openpar, RE(r"\(")),
+    (_closepar, RE(r"\)")),
+
+    # TERMINALS
+    (_id, RE(r"id:([\w?*]+)")),
+    (_tag, RE(r"tag:([\w?*]+)")),
+    (_writes, RE(r"writes:([\w?*]+)")),
+    (_reads, RE(r"reads:([\w?*]+)")),
+    (_iname, RE(r"iname:([\w?*]+)")),
+
+    (_whitespace, RE("[ \t]+")),
+    ]
+
+
+_TERMINALS = ([_id, _tag, _writes, _reads, _iname])
+
+# {{{ operator precedence
+
+_PREC_OR = 10
+_PREC_AND = 20
+_PREC_NOT = 30
+
+# }}}
+
+
+# {{{ match expression
+
+class MatchExpressionBase(object):
+    def __call__(self, kernel, matchable):
+        raise NotImplementedError
+
+
+class AllMatchExpression(MatchExpressionBase):
+    def __call__(self, kernel, matchable):
         return True
 
 
-class RegexIdentifierMatch(object):
-    def __init__(self, id_re, tag_re=None):
-        self.id_re = id_re
-        self.tag_re = tag_re
+class AndMatchExpression(MatchExpressionBase):
+    def __init__(self, children):
+        self.children = children
+
+    def __call__(self, kernel, matchable):
+        return all(ch(kernel, matchable) for ch in self.children)
+
+    def __str__(self):
+        return "(%s)" % (" and ".join(str(ch) for ch in self.children))
+
+
+class OrMatchExpression(MatchExpressionBase):
+    def __init__(self, children):
+        self.children = children
 
-    def __call__(self, identifier, tags):
-        assert isinstance(tags, (tuple, NoneType))
+    def __call__(self, kernel, matchable):
+        return any(ch(kernel, matchable) for ch in self.children)
 
-        if self.tag_re is None:
-            return self.id_re.match(identifier) is not None
+    def __str__(self):
+        return "(%s)" % (" or ".join(str(ch) for ch in self.children))
+
+
+class NotMatchExpression(MatchExpressionBase):
+    def __init__(self, child):
+        self.child = child
+
+    def __call__(self, kernel, matchable):
+        return not self.child(kernel, matchable)
+
+    def __str__(self):
+        return "(not %s)" % str(self.child)
+
+
+class GlobMatchExpressionBase(MatchExpressionBase):
+    def __init__(self, glob):
+        self.glob = glob
+
+        import re
+        from fnmatch import translate
+        self.re = re.compile("^"+translate(glob.strip())+"$")
+
+    def __str__(self):
+        descr = type(self).__name__
+        descr = descr[:descr.find("Match")]
+        return descr.lower() + ":" + self.glob
+
+
+class IdMatchExpression(GlobMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return self.re.match(matchable.id)
+
+
+class TagMatchExpression(GlobMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        if matchable.tags:
+            return any(self.re.match(tag) for tag in matchable.tags)
         else:
-            if not tags:
-                tags = ("",)
+            return False
+
 
-            return (
-                    self.id_re.match(identifier) is not None
-                    and any(
-                        self.tag_re.match(tag) is not None
-                        for tag in tags))
+class WritesMatchExpression(GlobMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return any(self.re.match(name)
+                for name in matchable.write_dependency_names())
 
 
-class AlternativeMatch(object):
-    def __init__(self, matches):
-        self.matches = matches
+class ReadsMatchExpression(GlobMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return any(self.re.match(name)
+                for name in matchable.read_dependency_names())
 
-    def __call__(self, identifier, tags):
-        from pytools import any
-        return any(
-                mtch(identifier, tags) for mtch in self.matches)
+
+class InameMatchExpression(GlobMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return any(self.re.match(name)
+                for name in matchable.inames(kernel))
 
 # }}}
 
 
-# {{{ single id match parsing
+# {{{ parser
 
-def parse_id_match(id_matches):
+def parse_match(expr_str):
     """Syntax examples::
 
-        my_insn
-        compute_*
-        fetch*$first
-        fetch*$first,store*$first
-
-    Alternatively, a list of *(name_glob, tag_glob)* tuples.
+    * ``id:yoink and writes:a_temp``
+    * ``id:yoink and (not writes:a_temp or tag:input)``
     """
+    if not expr_str:
+        return AllMatchExpression()
+
+    def parse_terminal(pstate):
+        next_tag = pstate.next_tag()
+        if next_tag is _id:
+            result = IdMatchExpression(pstate.next_match_obj().group(1))
+            pstate.advance()
+            return result
+        elif next_tag is _tag:
+            result = TagMatchExpression(pstate.next_match_obj().group(1))
+            pstate.advance()
+            return result
+        elif next_tag is _writes:
+            result = WritesMatchExpression(pstate.next_match_obj().group(1))
+            pstate.advance()
+            return result
+        elif next_tag is _reads:
+            result = ReadsMatchExpression(pstate.next_match_obj().group(1))
+            pstate.advance()
+            return result
+        elif next_tag is _iname:
+            result = InameMatchExpression(pstate.next_match_obj().group(1))
+            pstate.advance()
+            return result
+        else:
+            pstate.expected("terminal")
+
+    def inner_parse(pstate, min_precedence=0):
+        pstate.expect_not_end()
+
+        if pstate.is_next(_not):
+            pstate.advance()
+            left_query = NotMatchExpression(inner_parse(pstate, _PREC_NOT))
+        elif pstate.is_next(_openpar):
+            pstate.advance()
+            left_query = inner_parse(pstate)
+            pstate.expect(_closepar)
+            pstate.advance()
+        else:
+            left_query = parse_terminal(pstate)
 
-    if id_matches is None:
-        return AllMatch()
+        did_something = True
+        while did_something:
+            did_something = False
+            if pstate.is_at_end():
+                return left_query
 
-    if isinstance(id_matches, str):
-        id_matches = id_matches.strip()
-        id_matches = id_matches.split(",")
+            next_tag = pstate.next_tag()
 
-    if len(id_matches) > 1:
-        return AlternativeMatch([
-            parse_id_match(im) for im in id_matches])
+            if next_tag is _and and _PREC_AND > min_precedence:
+                pstate.advance()
+                left_query = AndMatchExpression(
+                        (left_query, inner_parse(pstate, _PREC_AND)))
+                did_something = True
+            elif next_tag is _or and _PREC_OR > min_precedence:
+                pstate.advance()
+                left_query = OrMatchExpression(
+                        (left_query, inner_parse(pstate, _PREC_OR)))
+                did_something = True
 
-    if len(id_matches) == 0:
-        return AllMatch()
+        return left_query
 
-    id_match, = id_matches
-    del id_matches
+    from pytools.lex import LexIterator, lex
+    pstate = LexIterator(
+        [(tag, s, idx, matchobj)
+         for (tag, s, idx, matchobj) in lex(_LEX_TABLE, expr_str, match_objects=True)
+         if tag is not _whitespace], expr_str)
 
-    def re_from_glob(s):
-        import re
-        from fnmatch import translate
-        return re.compile("^"+translate(s.strip())+"$")
+    if pstate.is_at_end():
+        pstate.raise_parse_error("unexpected end of input")
 
-    if not isinstance(id_match, tuple):
-        components = id_match.split("$")
+    result = inner_parse(pstate)
+    if not pstate.is_at_end():
+        pstate.raise_parse_error("leftover input after completed parse")
 
-    if len(components) == 1:
-        return RegexIdentifierMatch(re_from_glob(components[0]))
-    elif len(components) == 2:
-        return RegexIdentifierMatch(
-                re_from_glob(components[0]),
-                re_from_glob(components[1]))
-    else:
-        raise RuntimeError("too many (%d) $-separated components in id match"
-                % len(components))
+    return result
 
 # }}}
 
+# }}}
 
-# {{{ stack match objects
 
-# these match from the tail of the stack
+# {{{ stack match objects
 
-class StackMatchBase(object):
+class StackMatchComponent(object):
     pass
 
 
-class AllStackMatch(StackMatchBase):
-    def __call__(self, stack):
+class StackAllMatchComponent(StackMatchComponent):
+    def __call__(self, kernel, stack):
         return True
 
 
-class StackIdMatch(StackMatchBase):
-    def __init__(self, id_match, up_match):
-        self.id_match = id_match
-        self.up_match = up_match
+class StackBottomMatchComponent(StackMatchComponent):
+    def __call__(self, kernel, stack):
+        return not stack
+
+
+class StackItemMatchComponent(StackMatchComponent):
+    def __init__(self, match_expr, inner_match):
+        self.match_expr = match_expr
+        self.inner_match = inner_match
 
-    def __call__(self, stack):
+    def __call__(self, kernel, stack):
         if not stack:
             return False
 
-        last = stack[-1]
-        if not self.id_match(*last):
+        outer = stack[0]
+        if not self.match_expr(kernel, outer):
             return False
 
-        if self.up_match is None:
-            return True
-        else:
-            return self.up_match(stack[:-1])
+        return self.inner_match(kernel, stack[1:])
 
 
-class StackWildcardMatch(StackMatchBase):
-    def __init__(self, up_match):
-        self.up_match = up_match
+class StackWildcardMatchComponent(StackMatchComponent):
+    def __init__(self, inner_match):
+        self.inner_match = inner_match
 
-    def __call__(self, stack):
-        if self.up_match is None:
-            return True
+    def __call__(self, kernel, stack):
+        for i in range(0, len(stack)):
+            if self.inner_match(kernel, stack[i:]):
+                return True
 
-        n = len(stack)
+        return False
 
-        if self.up_match(stack):
-            return True
+# }}}
 
-        for i in range(1, n):
-            if self.up_match(stack[:-i]):
-                return True
 
-        return False
+# {{{ stack matcher
+
+class RuleInvocationMatchable(object):
+    def __init__(self, id, tags):
+        self.id = id
+        self.tags = tags
+
+    def write_dependency_names(self):
+        raise TypeError("writes: query may not be applied to rule invocations")
+
+    def read_dependency_names(self):
+        raise TypeError("reads: query may not be applied to rule invocations")
+
+    def inames(self, kernel):
+        raise TypeError("inames: query may not be applied to rule invocations")
+
+
+class StackMatch(object):
+    def __init__(self, root_component):
+        self.root_component = root_component
+
+    def __call__(self, kernel, insn, rule_stack):
+        """
+        :arg rule_stack: a tuple of (name, tags) rule invocations, outermost first
+        """
+        stack_of_matchables = [insn]
+        for id, tags in rule_stack:
+            stack_of_matchables.append(RuleInvocationMatchable(id, tags))
+
+        return self.root_component(kernel, stack_of_matchables)
 
 # }}}
 
@@ -180,34 +356,41 @@ class StackWildcardMatch(StackMatchBase):
 def parse_stack_match(smatch):
     """Syntax example::
 
-        lowest < next < ... < highest
+        ... > outer > ... > next > innermost $
+        insn > next
+        insn > ... > next > innermost $
 
-    where `lowest` is necessarily the bottom of the stack.  ``...`` matches an
-    arbitrary number of intervening stack levels. There is currently no way to
-    match the top of the stack.
+    ``...`` matches an arbitrary number of intervening stack levels.
 
-    Each of the entries is an identifier match as understood by
-    :func:`parse_id_match`.
+    Each of the entries is a match expression as understood by
+    :func:`parse_match`.
     """
 
-    if isinstance(smatch, StackMatchBase):
+    if isinstance(smatch, StackMatch):
         return smatch
 
-    match = AllStackMatch()
-
     if smatch is None:
-        return match
+        return StackMatch(StackAllMatchComponent())
+
+    smatch = smatch.strip()
+
+    match = StackAllMatchComponent()
+    if smatch[-1] == "$":
+        match = StackBottomMatchComponent()
+        smatch = smatch[:-1]
+
+    smatch = smatch.strip()
 
-    components = smatch.split("<")
+    components = smatch.split(">")
 
     for comp in components[::-1]:
         comp = comp.strip()
         if comp == "...":
-            match = StackWildcardMatch(match)
+            match = StackWildcardMatchComponent(match)
         else:
-            match = StackIdMatch(parse_id_match(comp), match)
+            match = StackItemMatchComponent(parse_match(comp), match)
 
-    return match
+    return StackMatch(match)
 
 # }}}
 
diff --git a/loopy/expression.py b/loopy/expression.py
index 2afb803b97b0e5b9f9ff1510da92ad10a50711dc..d912d528a7cc6a567db5aca684103eb736737030 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -1,4 +1,4 @@
-from __future__ import division, absolute_import
+from __future__ import division, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner"
 
@@ -79,9 +79,9 @@ class TypeInferenceMapper(CombineMapper):
         while dtypes:
             other = dtypes.pop()
 
-            if result.isbuiltin and other.isbuiltin:
+            if result.fields is None and other.fields is None:
                 if (result, other) in [
-                        (np.int32, np.float32), (np.int32, np.float32)]:
+                        (np.int32, np.float32), (np.float32, np.int32)]:
                     # numpy makes this a double. I disagree.
                     result = np.dtype(np.float32)
                 else:
@@ -89,11 +89,14 @@ class TypeInferenceMapper(CombineMapper):
                             np.empty(0, dtype=result)
                             + np.empty(0, dtype=other)
                             ).dtype
-            elif result.isbuiltin and not other.isbuiltin:
+
+            elif result.fields is None and other.fields is not None:
                 # assume the non-native type takes over
+                # (This is used for vector types.)
                 result = other
-            elif not result.isbuiltin and other.isbuiltin:
+            elif result.fields is not None and other.fields is None:
                 # assume the non-native type takes over
+                # (This is used for vector types.)
                 pass
             else:
                 if result is not other:
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 1c3d1283541c33ce3944e358f4abd451e2c881ee..aad2328b0bfbf4b37c6c6453358668d14fd39388 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -215,6 +215,13 @@ def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True)
     """
     :returns: a list of :class:`loopy.LoopKernel` objects
     """
+    import logging
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
+    console.setFormatter(formatter)
+    logging.getLogger('fparser').addHandler(console)
+
     from fparser import api
     tree = api.parse(source, isfree=free_form, isstrict=strict,
             analyze=False, ignore_comments=False)
diff --git a/loopy/fusion.py b/loopy/fusion.py
index 4431c2c7f56009b8e205cb310060b96cfa210eae..c14d936afb4ff063bad9e9ff7e1189daadf15a5c 100644
--- a/loopy/fusion.py
+++ b/loopy/fusion.py
@@ -170,11 +170,13 @@ def _fuse_two_kernels(knla, knlb):
             SubstitutionRuleMappingContext,
             RuleAwareSubstitutionMapper)
     from pymbolic.mapper.substitutor import make_subst_func
+    from loopy.context_matching import parse_stack_match
 
     srmc = SubstitutionRuleMappingContext(
             knlb.substitutions, knlb.get_var_name_generator())
     subst_map = RuleAwareSubstitutionMapper(
-            srmc, make_subst_func(b_var_renames), within=lambda stack: True)
+            srmc, make_subst_func(b_var_renames),
+            within=parse_stack_match(None))
     knlb = subst_map.map_kernel(knlb)
 
     # }}}
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 1f64e59e6c9e8ab6d599ea0cc61937df5a60ffcb..f930df60c93b1be4df80bcfd2789c60dca1e3654 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -902,43 +902,54 @@ class LoopKernel(RecordWithoutPickling):
     def __str__(self):
         lines = []
 
+        from loopy.preprocess import add_default_dependencies
+        kernel = add_default_dependencies(self)
+
         sep = 75*"-"
         lines.append(sep)
-        lines.append("KERNEL: " + self.name)
+        lines.append("KERNEL: " + kernel.name)
         lines.append(sep)
         lines.append("ARGUMENTS:")
-        for arg_name in sorted(self.arg_dict):
-            lines.append(str(self.arg_dict[arg_name]))
+        for arg_name in sorted(kernel.arg_dict):
+            lines.append(str(kernel.arg_dict[arg_name]))
         lines.append(sep)
         lines.append("DOMAINS:")
-        for dom, parents in zip(self.domains, self.all_parents_per_domain()):
+        for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()):
             lines.append(len(parents)*"  " + str(dom))
 
         lines.append(sep)
         lines.append("INAME IMPLEMENTATION TAGS:")
-        for iname in sorted(self.all_inames()):
-            line = "%s: %s" % (iname, self.iname_to_tag.get(iname))
+        for iname in sorted(kernel.all_inames()):
+            line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
             lines.append(line)
 
-        if self.temporary_variables:
+        if kernel.temporary_variables:
             lines.append(sep)
             lines.append("TEMPORARIES:")
-            for tv in sorted(six.itervalues(self.temporary_variables),
+            for tv in sorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
-        if self.substitutions:
+        if kernel.substitutions:
             lines.append(sep)
             lines.append("SUBSTIUTION RULES:")
-            for rule_name in sorted(six.iterkeys(self.substitutions)):
-                lines.append(str(self.substitutions[rule_name]))
+            for rule_name in sorted(six.iterkeys(kernel.substitutions)):
+                lines.append(str(kernel.substitutions[rule_name]))
 
         lines.append(sep)
         lines.append("INSTRUCTIONS:")
         loop_list_width = 35
 
-        import loopy as lp
-        for insn in self.instructions:
+        printed_insn_ids = set()
+
+        def print_insn(insn):
+            if insn.id in printed_insn_ids:
+                return
+            printed_insn_ids.add(insn.id)
+
+            for dep_id in insn.insn_deps:
+                print_insn(kernel.id_to_insn[dep_id])
+
             if isinstance(insn, lp.ExpressionInstruction):
                 lhs = str(insn.assignee)
                 rhs = str(insn.expression)
@@ -952,7 +963,7 @@ class LoopKernel(RecordWithoutPickling):
 
                 trailing = ["    "+l for l in insn.code.split("\n")]
 
-            loop_list = ",".join(sorted(self.insn_inames(insn)))
+            loop_list = ",".join(sorted(kernel.insn_inames(insn)))
 
             options = [insn.id]
             if insn.priority:
@@ -975,8 +986,12 @@ class LoopKernel(RecordWithoutPickling):
             if insn.predicates:
                 lines.append(10*" " + "if (%s)" % " && ".join(insn.predicates))
 
+        import loopy as lp
+        for insn in kernel.instructions:
+            print_insn(insn)
+
         dep_lines = []
-        for insn in self.instructions:
+        for insn in kernel.instructions:
             if insn.insn_deps:
                 dep_lines.append("%s : %s" % (insn.id, ",".join(insn.insn_deps)))
         if dep_lines:
@@ -987,10 +1002,10 @@ class LoopKernel(RecordWithoutPickling):
 
         lines.append(sep)
 
-        if self.schedule is not None:
+        if kernel.schedule is not None:
             lines.append("SCHEDULE:")
             from loopy.schedule import dump_schedule
-            lines.append(dump_schedule(self.schedule))
+            lines.append(dump_schedule(kernel.schedule))
             lines.append(sep)
 
         return "\n".join(lines)
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 7edf43e43bceb6ee3622f3645ad37c4375a703e0..1daa041c54a17214b6531751cf206878e6b1e676 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1,10 +1,6 @@
 """UI for kernel creation."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -36,6 +32,9 @@ from loopy.kernel.data import (
 import islpy as isl
 from islpy import dim_type
 
+import six
+from six.moves import range, zip
+
 import re
 import sys
 
@@ -904,9 +903,10 @@ def guess_arg_shape_if_requested(kernel, default_order):
                                         armap.access_range, i) + 1,
                                     constants_only=False)))
                     except:
-                        print>>sys.stderr, "While trying to find shape axis %d of "\
-                                "argument '%s', the following " \
-                                "exception occurred:" % (i, arg.name)
+                        print("While trying to find shape axis %d of "
+                                "argument '%s', the following "
+                                "exception occurred:" % (i, arg.name),
+                                file=sys.stderr)
                         raise
 
                 shape = tuple(shape)
diff --git a/loopy/precompute.py b/loopy/precompute.py
index 935d6d44040cf56b3ca3bdbbec26e22ec750a05d..1227082a9b304ad920940e2339e643ad2da65daa 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -81,7 +81,10 @@ class RuleInvocationGatherer(RuleAwareIdentityMapper):
         if self.subst_tag is not None and self.subst_tag != tag:
             process_me = False
 
-        process_me = process_me and self.within(expn_state.stack)
+        process_me = process_me and self.within(
+                expn_state.kernel,
+                expn_state.instruction,
+                expn_state.stack)
 
         if not process_me:
             return super(RuleInvocationGatherer, self).map_substitution(
@@ -132,7 +135,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
             access_descriptors, array_base_map,
             storage_axis_names, storage_axis_sources,
             non1_storage_axis_names,
-            target_var_name):
+            temporary_name):
         super(RuleInvocationReplacer, self).__init__(rule_mapping_context)
 
         self.subst_name = subst_name
@@ -146,12 +149,15 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
         self.storage_axis_sources = storage_axis_sources
         self.non1_storage_axis_names = non1_storage_axis_names
 
-        self.target_var_name = target_var_name
+        self.temporary_name = temporary_name
 
     def map_substitution(self, name, tag, arguments, expn_state):
         if not (
                 name == self.subst_name
-                and self.within(expn_state.stack)
+                and self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)
                 and (self.subst_tag is None or self.subst_tag == tag)):
             return super(RuleInvocationReplacer, self).map_substitution(
                     name, tag, arguments, expn_state)
@@ -196,7 +202,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
             ax_index = simplify_via_aff(ax_index - sax_base_idx)
             stor_subscript.append(ax_index)
 
-        new_outer_expr = var(self.target_var_name)
+        new_outer_expr = var(self.temporary_name)
         if stor_subscript:
             new_outer_expr = new_outer_expr.index(tuple(stor_subscript))
 
@@ -210,9 +216,9 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
 
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
-        storage_axes=None, precompute_inames=None, storage_axis_to_tag={},
-        default_tag="l.auto", dtype=None, fetch_bounding_box=False,
-        temporary_is_local=None):
+        storage_axes=None, temporary_name=None, precompute_inames=None,
+        storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
+        fetch_bounding_box=False, temporary_is_local=None):
     """Precompute the expression described in the substitution rule determined by
     *subst_use* and store it in a temporary array. A precomputation needs two
     things to operate, a list of *sweep_inames* (order irrelevant) and an
@@ -263,6 +269,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         May also equivalently be a comma-separated string.
     :arg within: a stack match as understood by
         :func:`loopy.context_matching.parse_stack_match`.
+    :arg temporary_name:
+        The temporary variable name to use for storing the precomputed data.
+        If it does not exist, it will be created. If it does exist, its properties
+        (such as size, type) are checked (and updated, if possible) to match
+        its use.
     :arg precompute_inames:
         If the specified inames do not already exist, they will be
         created. If they do already exist, their loop domain is verified
@@ -382,8 +393,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         import loopy as lp
         for insn in kernel.instructions:
             if isinstance(insn, lp.ExpressionInstruction):
-                invg(insn.assignee, insn.id, insn.tags)
-                invg(insn.expression, insn.id, insn.tags)
+                invg(insn.assignee, kernel, insn)
+                invg(insn.expression, kernel, insn)
 
         access_descriptors = invg.access_descriptors
         if not access_descriptors:
@@ -584,8 +595,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ set up compute insn
 
-    target_var_name = var_name_gen(based_on=c_subst_name)
-    assignee = var(target_var_name)
+    if temporary_name is None:
+        temporary_name = var_name_gen(based_on=c_subst_name)
+
+    assignee = var(temporary_name)
 
     if non1_storage_axis_names:
         assignee = assignee.index(
@@ -607,13 +620,13 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
 
-    from loopy.context_matching import AllStackMatch
+    from loopy.context_matching import parse_stack_match
     expr_subst_map = RuleAwareSubstitutionMapper(
             rule_mapping_context,
             make_subst_func(storage_axis_subst_dict),
-            within=AllStackMatch())
+            within=parse_stack_match(None))
 
-    compute_expression = expr_subst_map(subst.expression, None, None)
+    compute_expression = expr_subst_map(subst.expression, kernel, None)
 
     # }}}
 
@@ -633,7 +646,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             access_descriptors, abm,
             storage_axis_names, storage_axis_sources,
             non1_storage_axis_names,
-            target_var_name)
+            temporary_name)
 
     kernel = invr.map_kernel(kernel)
     kernel = kernel.copy(
@@ -655,15 +668,69 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     if temporary_is_local is None:
         temporary_is_local = lp.auto
 
+    new_temp_shape = tuple(abm.non1_storage_shape)
+
     new_temporary_variables = kernel.temporary_variables.copy()
-    temp_var = lp.TemporaryVariable(
-            name=target_var_name,
-            dtype=dtype,
-            base_indices=(0,)*len(abm.non1_storage_shape),
-            shape=tuple(abm.non1_storage_shape),
-            is_local=temporary_is_local)
-
-    new_temporary_variables[target_var_name] = temp_var
+    if temporary_name not in new_temporary_variables:
+        temp_var = lp.TemporaryVariable(
+                name=temporary_name,
+                dtype=dtype,
+                base_indices=(0,)*len(new_temp_shape),
+                shape=tuple(abm.non1_storage_shape),
+                is_local=temporary_is_local)
+
+    else:
+        temp_var = new_temporary_variables[temporary_name]
+
+        # {{{ check and adapt existing temporary
+
+        if temp_var.dtype is lp.auto:
+            pass
+        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
+            dtype = temp_var.dtype
+        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
+            if temp_var.dtype != dtype:
+                raise LoopyError("Existing and new dtype of temporary '%s' "
+                        "do not match (existing: %s, new: %s)"
+                        % (temporary_name, temp_var.dtype, dtype))
+
+        temp_var = temp_var.copy(dtype=dtype)
+
+        if len(temp_var.shape) != len(new_temp_shape):
+            raise LoopyError("Existing and new temporary '%s' do not "
+                    "have matching number of dimensions "
+                    % (temporary_name,
+                        len(temp_var.shape), len(new_temp_shape)))
+
+        if temp_var.base_indices != (0,) * len(new_temp_shape):
+            raise LoopyError("Existing and new temporary '%s' do not "
+                    "have matching number of dimensions "
+                    % (temporary_name,
+                        len(temp_var.shape), len(new_temp_shape)))
+
+        new_temp_shape = tuple(
+                max(i, ex_i)
+                for i, ex_i in zip(new_temp_shape, temp_var.shape))
+
+        temp_var = temp_var.copy(shape=new_temp_shape)
+
+        if temporary_is_local == temp_var.is_local:
+            pass
+        elif temporary_is_local is lp.auto:
+            temporary_is_local = temp_var.is_local
+        elif temp_var.is_local is lp.auto:
+            pass
+        else:
+            raise LoopyError("Existing and new temporary '%s' do not "
+                    "have matching values of 'is_local'"
+                    % (temporary_name,
+                        temp_var.is_local, temporary_is_local))
+
+        temp_var = temp_var.copy(is_local=temporary_is_local)
+
+        # }}}
+
+    new_temporary_variables[temporary_name] = temp_var
 
     kernel = kernel.copy(
             temporary_variables=new_temporary_variables)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index efe2f1d0eabb1133dc29d2a2b7592b95f3a260be..2fc5b37fa49acbfe3ff5f1672937369b8b8884a5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1,4 +1,4 @@
-from __future__ import division, absolute_import
+from __future__ import division, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2015 James Stevens"
 
@@ -73,9 +73,6 @@ class TypeToOpCountMap:
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
-    def __setitem__(self, index, value):
-        self.dict[index] = value
-
     def __str__(self):
         return str(self.dict)
 
diff --git a/loopy/subst.py b/loopy/subst.py
index 3b112a4fd44880f6cd66019aff1023ee426ae125..a0a031718962df3053b80058818b2f2a4b88d2c8 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -258,7 +258,10 @@ class TemporaryToSubstChanger(RuleAwareIdentityMapper):
 
         my_def_id = self.usage_to_definition[my_insn_id]
 
-        if not self.within(expn_state.stack):
+        if not self.within(
+                expn_state.kernel,
+                expn_state.instruction,
+                expn_state.stack):
             self.saw_unmatched_usage_sites[my_def_id] = True
             return None
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index bad7f840f648bc72d7f505fb72a200f3df2fb0d6..3311d23165b5f34f16c0c745218fd7da3a0ad2ac 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -398,11 +398,13 @@ def parse_tagged_name(expr):
 
 class ExpansionState(Record):
     """
+    .. attribute:: kernel
+    .. attribute:: instruction
+
     .. attribute:: stack
 
         a tuple representing the current expansion stack, as a tuple
-        of (name, tag) pairs. At the top level, this should be initialized to a
-        tuple with the id of the calling instruction.
+        of (name, tag) pairs.
 
     .. attribute:: arg_context
 
@@ -411,7 +413,7 @@ class ExpansionState(Record):
 
     @property
     def insn_id(self):
-        return self.stack[0][0]
+        return self.instruction.id
 
     def apply_arg_context(self, expr):
         from pymbolic.mapper.substitutor import make_subst_func
@@ -625,16 +627,18 @@ class RuleAwareIdentityMapper(IdentityMapper):
         else:
             return sym
 
-    def __call__(self, expr, insn_id, insn_tags):
-        if insn_id is not None:
-            stack = ((insn_id, insn_tags),)
-        else:
-            stack = ()
+    def __call__(self, expr, kernel, insn):
+        from loopy.kernel.data import InstructionBase
+        assert insn is None or isinstance(insn, InstructionBase)
 
-        return IdentityMapper.__call__(self, expr, ExpansionState(
-            stack=stack, arg_context={}))
+        return IdentityMapper.__call__(self, expr,
+                ExpansionState(
+                    kernel=kernel,
+                    instruction=insn,
+                    stack=(),
+                    arg_context={}))
 
-    def map_instruction(self, insn):
+    def map_instruction(self, kernel, insn):
         return insn
 
     def map_kernel(self, kernel):
@@ -642,8 +646,8 @@ class RuleAwareIdentityMapper(IdentityMapper):
                 # While subst rules are not allowed in assignees, the mapper
                 # may perform tasks entirely unrelated to subst rules, so
                 # we must map assignees, too.
-                self.map_instruction(
-                    insn.with_transformed_expressions(self, insn.id, insn.tags))
+                self.map_instruction(kernel,
+                    insn.with_transformed_expressions(self, kernel, insn))
                 for insn in kernel.instructions]
 
         return kernel.copy(instructions=new_insns)
@@ -658,7 +662,8 @@ class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper):
 
     def map_variable(self, expr, expn_state):
         if (expr.name in expn_state.arg_context
-                or not self.within(expn_state.stack)):
+                or not self.within(
+                    expn_state.kernel, expn_state.instruction, expn_state.stack)):
             return super(RuleAwareSubstitutionMapper, self).map_variable(
                     expr, expn_state)
 
@@ -685,7 +690,7 @@ class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper):
 
         new_stack = expn_state.stack + ((name, tags),)
 
-        if self.within(new_stack):
+        if self.within(expn_state.kernel, expn_state.instruction, new_stack):
             # expand
             rule = self.rules[name]
 
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 1dc3aafcfb63789255a7f5fd79b178e657b3c74d..d54640b5d8e277be32f9314db1453a05f9097a08 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -373,17 +373,17 @@ class LoopyCCodeMapper(RecursiveMapper):
                 # This made it through type 'guessing' above, and it
                 # was concluded above (search for COMPLEX_GUESS_LOGIC),
                 # that nothing was lost by using single precision.
-                cast_type = "cfloat_t"
+                cast_type = "cfloat"
             else:
                 if dtype == np.complex128:
-                    cast_type = "cdouble_t"
+                    cast_type = "cdouble"
                 elif dtype == np.complex64:
-                    cast_type = "cfloat_t"
+                    cast_type = "cfloat"
                 else:
                     raise RuntimeError("unsupported complex type in expression "
                             "generation: %s" % type(expr))
 
-            return "(%s) (%s, %s)" % (cast_type, repr(expr.real), repr(expr.imag))
+            return "%s_new(%s, %s)" % (cast_type, repr(expr.real), repr(expr.imag))
         else:
             if type_context == "f":
                 return repr(float(expr))+"f"
@@ -490,11 +490,20 @@ class LoopyCCodeMapper(RecursiveMapper):
                     if 'c' == self.infer_type(child).kind]
 
             real_sum = self.join_rec(" + ", reals, PREC_SUM, type_context)
-            complex_sum = self.join_rec(
-                    " + ", complexes, PREC_SUM, type_context, tgt_dtype)
+
+            if len(complexes) == 1:
+                myprec = PREC_SUM
+            else:
+                myprec = PREC_NONE
+
+            complex_sum = self.rec(complexes[0], myprec, type_context, tgt_dtype)
+            for child in complexes[1:]:
+                complex_sum = "%s_add(%s, %s)" % (
+                        tgt_name, complex_sum,
+                        self.rec(child, PREC_NONE, type_context, tgt_dtype))
 
             if real_sum:
-                result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum)
+                result = "%s_radd(%s, %s)" % (tgt_name, real_sum, complex_sum)
             else:
                 result = complex_sum
 
@@ -545,8 +554,7 @@ class LoopyCCodeMapper(RecursiveMapper):
                         self.rec(child, PREC_NONE, type_context, tgt_dtype))
 
             if real_prd:
-                # elementwise semantics are correct
-                result = "%s*%s" % (real_prd, complex_prd)
+                result = "%s_rmul(%s, %s)" % (tgt_name, real_prd, complex_prd)
             else:
                 result = complex_prd
 
@@ -591,9 +599,10 @@ class LoopyCCodeMapper(RecursiveMapper):
         if not (n_complex or d_complex):
             return base_impl(expr, enclosing_prec, type_context)
         elif n_complex and not d_complex:
-            # elementwise semantics are correct
-            return base_impl(expr, enclosing_prec, type_context,
-                    num_tgt_dtype=tgt_dtype)
+            return "%s_divider(%s, %s)" % (
+                    self.complex_type_name(tgt_dtype),
+                    self.rec(expr.numerator, PREC_NONE, type_context, tgt_dtype),
+                    self.rec(expr.denominator, PREC_NONE, type_context))
         elif not n_complex and d_complex:
             return "%s_rdivide(%s, %s)" % (
                     self.complex_type_name(tgt_dtype),
diff --git a/loopy/version.py b/loopy/version.py
index e7d877aa136ef4ad15774388ba9c9cb37b828ed1..71911e9570f3e404c9c2fe64a479f8d97259906d 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -25,4 +25,4 @@ VERSION = (2014, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
-DATA_MODEL_VERSION = "v6"
+DATA_MODEL_VERSION = "v8"
diff --git a/test/test_fortran.py b/test/test_fortran.py
index d361b15dc3d5f280bd9a4dadd4dbd603d4470284..4117b80a27b243dee1db94b5a0bb2b83b2ec8d49 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -267,7 +267,7 @@ def test_tagged(ctx_factory):
 
     knl, = lp.parse_fortran(fortran_src)
 
-    assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2
+    assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2
 
 
 @pytest.mark.parametrize("buffer_inames", [
@@ -408,6 +408,10 @@ def test_fuse_kernels(ctx_factory):
 
     assert len(knl.temporary_variables) == 2
 
+    # This is needed for correctness, otherwise ordering could foul things up.
+    knl = lp.temporary_to_subst(knl, "prev")
+    knl = lp.temporary_to_subst(knl, "prev_0")
+
     ctx = ctx_factory()
     lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 2173347cadaea66a4ec53d78fb8c77abb24cc7bb..1527e4ff710871f4ee7cddde95ad2016d78969a2 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -65,7 +65,7 @@ def test_complicated_subst(ctx_factory):
                 a[i] = h$one(i) * h$two(i)
                 """)
 
-    knl = lp.expand_subst(knl, "g$two < h$two")
+    knl = lp.expand_subst(knl, "... > id:h and tag:two > id:g and tag:two")
 
     print(knl)
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index 33d15f88dab634c0420474d91fdd74efbfae4ae2..a00fce1776dd8d0722e59ae544ce439568a66d5d 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -39,7 +39,7 @@ def test_tim2d(ctx_factory):
     n = 8
 
     from pymbolic import var
-    K_sym = var("K")
+    K_sym = var("K")  # noqa
 
     field_shape = (K_sym, n, n)
 
@@ -70,8 +70,8 @@ def test_tim2d(ctx_factory):
                 ],
             name="semlap2D", assumptions="K>=1")
 
-    knl = lp.duplicate_inames(knl, "o", within="ur")
-    knl = lp.duplicate_inames(knl, "o", within="us")
+    knl = lp.duplicate_inames(knl, "o", within="id:ur")
+    knl = lp.duplicate_inames(knl, "o", within="id:us")
 
     seq_knl = knl
 
@@ -93,13 +93,13 @@ def test_tim2d(ctx_factory):
         knl = lp.tag_inames(knl, dict(o="unr"))
         knl = lp.tag_inames(knl, dict(m="unr"))
 
-        knl = lp.set_instruction_priority(knl, "D_fetch", 5)
+        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
         print(knl)
 
         return knl
 
     for variant in [variant_orig]:
-        K = 1000
+        K = 1000  # noqa
         lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                 op_count=[K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9],
                 op_label=["GFlops"],
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 29cb5f98ed273a178a0dd4cc927b952ec41f8d76..b9ec0a1c48b0e579b1029efeef81aa1877b68b76 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -26,7 +26,8 @@ import sys
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
-from loopy.statistics import *  # noqa
+import loopy as lp
+from loopy.statistics import get_op_poly  # noqa
 import numpy as np
 
 
@@ -131,7 +132,7 @@ def test_op_counter_bitwise():
             [
                 """
                 c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
-                e[i, k] = (g[i,k] ^ k)*~(h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
+                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                 """
             ],
             name="bitwise", assumptions="n,m,l >= 1")
@@ -140,6 +141,7 @@ def test_op_counter_bitwise():
             knl, dict(
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
+
     poly = get_op_poly(knl)
 
     n = 10
@@ -192,3 +194,4 @@ if __name__ == "__main__":
     else:
         from py.test.cmdline import main
         main([__file__])
+