diff --git a/doc/reference.rst b/doc/reference.rst
index c829462d37855b54bac993174c37a2faef2a1dbf..bc8b9c484b2b6e28c13ad82c6df8c01316c38bdc 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -296,6 +296,9 @@ These are usually key-value pairs. The following attributes are recognized:
   variables (which must be scalar variables) evaluate to ``true`` (as
   defined by C).
 
+* ``tags=tag1:tag2`` Apply tags to this instruction that can then be used
+  for :ref:`context-matching`.
+
 .. autoclass:: ExpressionInstruction
 
 
@@ -334,9 +337,13 @@ function, which is responsible for creating kernels:
 Transforming Kernels
 --------------------
 
+.. _context-matching:
+
 Matching contexts
 ^^^^^^^^^^^^^^^^^
 
+TODO: Matching instruction tags
+
 .. automodule:: loopy.context_matching
 
 .. autofunction:: parse_id_match
diff --git a/loopy/context_matching.py b/loopy/context_matching.py
index a80aa04c9cc85c7bc6e925c131d836c48a0273de..eb85c1dd0a4c0b7b55607ec5358d9e3ef18470cb 100644
--- a/loopy/context_matching.py
+++ b/loopy/context_matching.py
@@ -27,6 +27,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import types
+
 
 # {{{ id match objects
 
@@ -40,26 +42,30 @@ class RegexIdentifierMatch(object):
         self.id_re = id_re
         self.tag_re = tag_re
 
-    def __call__(self, identifier, tag):
+    def __call__(self, identifier, tags):
+        assert tags is None or isinstance(tags, tuple)  # py3-safe NoneType check
+
         if self.tag_re is None:
             return self.id_re.match(identifier) is not None
         else:
-            if tag is None:
-                tag = ""
+            if not tags:
+                tags = ("",)
 
             return (
                     self.id_re.match(identifier) is not None
-                    and self.tag_re.match(tag) is not None)
+                    and any(
+                        self.tag_re.match(tag) is not None
+                        for tag in tags))
 
 
 class AlternativeMatch(object):
     def __init__(self, matches):
         self.matches = matches
 
-    def __call__(self, identifier, tag):
+    def __call__(self, identifier, tags):
         from pytools import any
         return any(
-                mtch(identifier, tag) for mtch in self.matches)
+                mtch(identifier, tags) for mtch in self.matches)
 
 # }}}
 
@@ -206,5 +212,4 @@ def parse_stack_match(smatch):
 # }}}
 
 
-
 # vim: foldmethod=marker
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index b932d448a47cd328755c20b7e567642276f1565a..0727000f9d1b5cda84359c0d7ac3e49e5694f028 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -198,6 +198,7 @@ def parse_insn(insn):
         forced_iname_deps_is_final = False
         forced_iname_deps = frozenset()
         predicates = frozenset()
+        tags = ()
 
         if groups["options"] is not None:
             for option in groups["options"].split(","):
@@ -244,8 +245,15 @@ def parse_insn(insn):
                         forced_iname_deps_is_final = True
 
                     forced_iname_deps = frozenset(opt_value.split(":"))
+
                 elif opt_key == "if":
                     predicates = frozenset(opt_value.split(":"))
+
+                elif opt_key == "tags":
+                    tags = tuple(
+                            tag.strip() for tag in opt_value.split(":")
+                            if tag.strip())
+
                 else:
                     raise ValueError("unrecognized instruction option '%s'"
                             % opt_key)
@@ -273,7 +281,8 @@ def parse_insn(insn):
                     assignee=lhs, expression=rhs,
                     temp_var_type=temp_var_type,
                     priority=priority,
-                    predicates=predicates), inames_to_dup
+                    predicates=predicates,
+                    tags=tags), inames_to_dup
 
     elif subst_match is not None:
         from pymbolic.primitives import Variable, Call
@@ -450,16 +459,16 @@ class ArgumentGuesser:
                 (assignee_var_name, _), = insn.assignees_and_indices()
                 self.all_written_names.add(assignee_var_name)
                 self.all_names.update(get_dependencies(
-                    self.submap(insn.assignee, insn.id)))
+                    self.submap(insn.assignee, insn.id, insn.tags)))
                 self.all_names.update(get_dependencies(
-                    self.submap(insn.expression, insn.id)))
+                    self.submap(insn.expression, insn.id, insn.tags)))
 
     def find_index_rank(self, name):
         irf = IndexRankFinder(name)
 
         for insn in self.instructions:
             insn.with_transformed_expressions(
-                    lambda expr: irf(self.submap(expr, insn.id)))
+                    lambda expr: irf(self.submap(expr, insn.id, insn.tags)))
 
         if not irf.index_ranks:
             return 0
@@ -860,9 +869,9 @@ def guess_arg_shape_if_requested(kernel, default_order):
             try:
                 for insn in kernel.instructions:
                     if isinstance(insn, lp.ExpressionInstruction):
-                        armap(submap(insn.assignee, insn.id),
+                        armap(submap(insn.assignee, insn.id, insn.tags),
                                 kernel.insn_inames(insn))
-                        armap(submap(insn.expression, insn.id),
+                        armap(submap(insn.expression, insn.id, insn.tags),
                                 kernel.insn_inames(insn))
             except TypeError as e:
                 from loopy.diagnostic import LoopyError
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index a7e4720147b4f6e8d7ab4afbd6b83db2dc339bb2..a563b9ef39560dacfa77943e037d4a25b2213c15 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -457,6 +457,11 @@ class InstructionBase(Record):
         A :class:`set` of inames into which the instruction
         may need to be boosted, as a heuristic help for the scheduler.
         Also allowed to be *None*.
+
+    .. attribute:: tags
+
+        A tuple of string identifiers that can be used to identify groups
+        of statements.
     """
 
     fields = set("id insn_deps insn_deps_is_final predicates "
@@ -465,7 +470,7 @@ class InstructionBase(Record):
 
     def __init__(self, id, insn_deps, insn_deps_is_final,
             forced_iname_deps_is_final, forced_iname_deps, priority,
-            boostable, boostable_into, predicates):
+            boostable, boostable_into, predicates, tags):
 
         if forced_iname_deps_is_final is None:
             forced_iname_deps_is_final = False
@@ -473,6 +478,9 @@ class InstructionBase(Record):
         if insn_deps_is_final is None:
             insn_deps_is_final = False
 
+        if tags is None:
+            tags = ()
+
         assert isinstance(forced_iname_deps, frozenset)
         assert isinstance(insn_deps, frozenset) or insn_deps is None
 
@@ -485,7 +493,8 @@ class InstructionBase(Record):
                 priority=priority,
                 boostable=boostable,
                 boostable_into=boostable_into,
-                predicates=predicates)
+                predicates=predicates,
+                tags=tags)
 
     # {{{ abstract interface
 
@@ -551,6 +560,8 @@ class InstructionBase(Record):
             result.append("deps="+":".join(self.insn_deps))
         if self.priority:
             result.append("priority=%d" % self.priority)
+        if self.tags:
+            result.append("tags=%s" % ":".join(self.tags))
 
         return result
 
@@ -629,7 +640,7 @@ class ExpressionInstruction(InstructionBase):
             forced_iname_deps=frozenset(),
             insn_deps=None,
             insn_deps_is_final=None,
-            boostable=None, boostable_into=None,
+            boostable=None, boostable_into=None, tags=None,
             temp_var_type=None, priority=0, predicates=frozenset()):
 
         InstructionBase.__init__(self,
@@ -641,7 +652,8 @@ class ExpressionInstruction(InstructionBase):
                 boostable=boostable,
                 boostable_into=boostable_into,
                 priority=priority,
-                predicates=predicates)
+                predicates=predicates,
+                tags=tags)
 
         from loopy.symbolic import parse
         if isinstance(assignee, str):
@@ -795,7 +807,7 @@ class CInstruction(InstructionBase):
             id=None, insn_deps=None, insn_deps_is_final=None,
             forced_iname_deps_is_final=None, forced_iname_deps=frozenset(),
             priority=0, boostable=None, boostable_into=None,
-            predicates=frozenset()):
+            predicates=frozenset(), tags=None):
         """
         :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples,
             simple strings pepresenting inames are also allowed. A single
@@ -814,7 +826,7 @@ class CInstruction(InstructionBase):
                 insn_deps_is_final=insn_deps_is_final,
                 boostable=boostable,
                 boostable_into=boostable_into,
-                priority=priority, predicates=predicates)
+                priority=priority, predicates=predicates, tags=tags)
 
         # {{{ normalize iname_exprs
 
@@ -914,6 +926,7 @@ class CInstruction(InstructionBase):
                     key_builder.update_for_pymbolic_expression(key_hash, val)
             else:
                 key_builder.rec(key_hash, getattr(self, field_name))
+
 # }}}
 
 # }}}
diff --git a/loopy/precompute.py b/loopy/precompute.py
index 485c9c954552bc42da237021556abfbee3ac9f02..21e6e5ddf394301ce9fcf9c54c682adf1dee3f60 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -407,7 +407,8 @@ class InvocationGatherer(ExpandingIdentityMapper):
         arg_deps = set()
         for arg_val in six.itervalues(arg_context):
             arg_deps = (arg_deps
-                    | get_dependencies(self.subst_expander(arg_val, insn_id=None)))
+                    | get_dependencies(self.subst_expander(
+                        arg_val, insn_id=None, insn_tags=None)))
 
         if not arg_deps <= self.kernel.all_inames():
             from warnings import warn
@@ -680,7 +681,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     import loopy as lp
     for insn in kernel.instructions:
         if isinstance(insn, lp.ExpressionInstruction):
-            invg(insn.expression, insn.id)
+            invg(insn.expression, insn.id, insn.tags)
 
     for invdesc in invg.invocation_descriptors:
         invocation_descriptors.append(
@@ -717,7 +718,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     submap = SubstitutionRuleExpander(kernel.substitutions)
 
     value_inames = get_dependencies(
-            submap(subst.expression, insn_id=None)) & kernel.all_inames()
+            submap(subst.expression, insn_id=None, insn_tags=None)
+            ) & kernel.all_inames()
     if value_inames - expanding_usage_arg_deps < extra_storage_axes:
         raise RuntimeError("unreferenced sweep inames specified: "
                 + ", ".join(extra_storage_axes
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 0adea202a1a9319c64a13e85f819a60492b9158a..8edddc2bdfa78cec6674e702661b1d6f6f1a4057 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -62,7 +62,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         if not isinstance(writer_insn, lp.ExpressionInstruction):
             continue
 
-        expr = subst_expander(writer_insn.expression, insn_id=writer_insn_id)
+        expr = subst_expander(writer_insn.expression,
+                insn_id=writer_insn_id, insn_tags=writer_insn.tags)
 
         try:
             debug("             via expr %s" % expr)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index aec1c15efc3c5079c793bfb5e1aa0d05470f2977..3b3eba49e8cec0d62fd2a57bcb19d795dcb40167 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -429,8 +429,14 @@ class ExpandingIdentityMapper(IdentityMapper):
         rule = self.old_subst_rules[name]
 
         rec_arguments = self.rec(arguments, expn_state)
+
+        if tag is None:
+            tags = None
+        else:
+            tags = (tag,)
+
         new_expn_state = expn_state.copy(
-                stack=expn_state.stack + ((name, tag),),
+                stack=expn_state.stack + ((name, tags),),
                 arg_context=self.make_new_arg_context(
                     name, rule.arguments, rec_arguments, expn_state.arg_context))
 
@@ -448,9 +454,9 @@ class ExpandingIdentityMapper(IdentityMapper):
         else:
             return sym
 
-    def __call__(self, expr, insn_id):
+    def __call__(self, expr, insn_id, insn_tags):
         if insn_id is not None:
-            stack = ((insn_id, None),)
+            stack = ((insn_id, insn_tags),)
         else:
             stack = ()
 
@@ -520,7 +526,7 @@ class ExpandingIdentityMapper(IdentityMapper):
                 # may perform tasks entirely unrelated to subst rules, so
                 # we must map assignees, too.
 
-                insn.with_transformed_expressions(self, insn.id)
+                insn.with_transformed_expressions(self, insn.id, insn.tags)
                 for insn in kernel.instructions]
 
         new_substs, renames = self._get_new_substitutions_and_renames()
@@ -564,7 +570,13 @@ class SubstitutionRuleExpander(ExpandingIdentityMapper):
         self.ctx_match = ctx_match
 
     def map_substitution(self, name, tag, arguments, expn_state):
-        new_stack = expn_state.stack + ((name, tag),)
+        if tag is None:
+            tags = None
+        else:
+            tags = (tag,)
+
+        new_stack = expn_state.stack + ((name, tags),)
+
         if self.ctx_match(new_stack):
             # expand
             rule = self.old_subst_rules[name]