diff --git a/MEMO b/MEMO
index 281e17845371a4c6c666824914bc768187818c6b..d7526fd0da6cb2e42cb43d0980f21b808263732d 100644
--- a/MEMO
+++ b/MEMO
@@ -40,11 +40,15 @@ Things to consider
 - implemented_domain may end up being smaller than requested in cse
   evaluations--check that!
 
-- Auto tag assignment depends on known work group size
-
 - Depedencies are pointwise for shared loop dimensions
   and global over non-shared ones (between dependent and ancestor)
 
+- multiple insns could fight over which iname gets local axis 0
+  -> complicated optimization problem
+
+TODO
+^^^^
+
 - Parallel dimension splitting/merging via tags
 
 - FIXME: Deal with insns losing a seq iname dep in a CSE realization
@@ -53,19 +57,13 @@ Things to consider
 
 - Every loop in loopy is opened at most once.
 
-- Syntax to declare insn deps
-
 - reimplement add_prefetch
 
 - user interface for dim length prescription
 
-- make syntax for explicit loop dependencies
-
-- multiple insns could fight over which iname gets local axis 0
-  -> complicated optimization problem
-
 - How to determine which variables need to be duplicated for ILP?
-  -> Only reduction
+  -> Reduction
+  -> CSEs?
 
 - Slab decomposition for parallel dimensions
   - implement at the outermost nesting level regardless
@@ -79,6 +77,8 @@ Things to consider
 Dealt with
 ^^^^^^^^^^
 
+- make syntax for explicit loop dependencies
+
 - Implement get_problems()
 
 - CSE iname duplication might be unnecessary?
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 6f1c9b930e51177ccde0f892a577ae3efcb7eace..ce40f0d2b98dd139a41ac27a50e40d289173c83d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -30,7 +30,6 @@ from loopy.kernel import ScalarArg, ArrayArg, ImageArg
 
 from loopy.kernel import LoopKernel
 from loopy.schedule import generate_loop_schedules
-from loopy.prefetch import insert_register_prefetches
 from loopy.compiled import CompiledKernel, drive_timing_run
 
 # }}}
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 762a4c4b6777e0e40f40d64552fdb1b9a72b779a..59a9846b15c9c817c255a4c188d19e477b32379f 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -402,27 +402,39 @@ class LoopKernel(Record):
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
         """
+        import re
+        LABEL_DEP_RE = re.compile(
+                r"^(?:\{(?P<label>\w+)\})?"
+                r"\s*(?P<lhs>.+)\s*=\s*(?P<rhs>.+?)\s*"
+                r"(?:\:\s*(?P<deps>[\s\w,]+))?$"
+                )  # rhs is lazy so a trailing ": deps" suffix lands in the deps group
 
         def parse_if_necessary(insn):
             from pymbolic import parse
 
+            deps = []  # filled from the comma-separated names after ':' in string form
+            label = "insn"  # default based_on value; replaced by a leading "{label}"
+
             if isinstance(insn, Instruction):
                 return insn
             if isinstance(insn, str):
-                lhs, rhs = insn.split("=")
-            elif isinstance(insn, tuple):
-                lhs, rhs = insn
+                label_dep_match = LABEL_DEP_RE.match(insn)
+                if label_dep_match is None:
+                    raise RuntimeError("insn parse error")
 
-            if isinstance(lhs, str):
-                lhs = parse(lhs)
+                groups = label_dep_match.groupdict()
+                if groups["label"] is not None:
+                    label = groups["label"]
+                if groups["deps"] is not None:
+                    deps = [dep.strip() for dep in groups["deps"].split(",")]
 
-            if isinstance(rhs, str):
+                lhs = parse(groups["lhs"])
                 from loopy.symbolic import FunctionToPrimitiveMapper
-                rhs = parse(rhs)
-                rhs = FunctionToPrimitiveMapper()(rhs)
+                rhs = FunctionToPrimitiveMapper()(parse(groups["rhs"]))
 
             return Instruction(
-                    id=self.make_unique_instruction_id(insns),
+                    id=self.make_unique_instruction_id(insns, based_on=label),
+                    insn_deps=deps,
                     assignee=lhs, expression=rhs)
 
         if isinstance(domain, str):
@@ -522,12 +534,6 @@ class LoopKernel(Record):
             if var_name not in used_vars:
                 return var_name
 
-    @property
-    @memoize_method
-    def dim_to_name(self):
-        from pytools import reverse_dict
-        return reverse_dict(self.iname_to_dim)
-
     @property
     @memoize_method
     def id_to_insn(self):
@@ -559,10 +565,6 @@ class LoopKernel(Record):
         from islpy import dim_type
         return set(self.space.get_var_dict(dim_type.set).iterkeys())
 
-    def inames_by_tag_type(self, tag_type):
-        return [iname for iname in self.all_inames()
-                if isinstance(self.iname_to_tag.get(iname), tag_type)]
-
     @memoize_method
     def get_iname_bounds(self, iname):
         lower_bound_pw_aff = (self.domain
diff --git a/test/test_matmul.py b/test/test_matmul.py
index e342289be9e4d678bfd7794fd8d22cf7dea07603..b928b57c820524f4d93e4e68eade2dee930861ab 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -203,7 +203,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
     knl = lp.LoopKernel(ctx.devices[0],
             "[n] -> {[i,j,k]: 0<=i,j,k<n}",
             [
-                "c[i, j] = reduce(sum_float32, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))"
+                "{yo} c[i, j] = reduce(sum_float32, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))"
                 ],
             [
                 lp.ArrayArg("a", dtype, shape=(n, n), order=order),