From 9956ff0566adec2e080073502d19ca7aa5547877 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 3 Nov 2011 03:32:28 -0400
Subject: [PATCH] Find insn iname deps by fixed point iteration. Dot dependency
 graphing. Schedule improvements.

insn.all_inames() -> kernel.insn_inames(insn)
Scheduling:
- Only schedule referenced inames.
- Only work towards insns that have their dependencies satisfied.
---
 loopy/__init__.py            |   5 +-
 loopy/check.py               |  20 ++---
 loopy/codegen/instruction.py |   4 +-
 loopy/cse.py                 |  11 +--
 loopy/kernel.py              | 156 ++++++++++++++++++++++++++++++-----
 loopy/preprocess.py          |  54 +++---------
 loopy/schedule.py            |  20 +++--
 loopy/symbolic.py            |  22 ++++-
 test/test_sem.py             |  26 +++---
 9 files changed, 211 insertions(+), 107 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index d952538a3..a66b9958f 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,7 +22,7 @@ class LoopyAdvisory(UserWarning):
 
 from loopy.kernel import ScalarArg, ArrayArg, ImageArg
 
-from loopy.kernel import AutoFitLocalIndexTag
+from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph
 from loopy.cse import realize_cse
 from loopy.preprocess import preprocess_kernel
 from loopy.schedule import generate_loop_schedules
@@ -31,6 +31,7 @@ from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq
 from loopy.check import check_kernels
 
 __all__ = ["ScalarArg", "ArrayArg", "ImageArg",
+        "get_dot_dependency_graph",
         "preprocess_kernel", "generate_loop_schedules",
         "generate_code",
         "CompiledKernel", "drive_timing_run", "check_kernels",
@@ -155,7 +156,7 @@ def make_kernel(*args, **kwargs):
             from pymbolic.primitives import Variable
             for index_expr in insn.get_assignee_indices():
                 if (not isinstance(index_expr, Variable)
-                        or not index_expr.name in insn.all_inames()):
+                        or not index_expr.name in knl.insn_inames(insn)):
                     raise RuntimeError(
                             "only plain inames are allowed in "
                             "the lvalue index when declaring the "
diff --git a/loopy/check.py b/loopy/check.py
index 8eb853035..d8c63a302 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -21,7 +21,7 @@ def check_for_unused_hw_axes_in_insns(kernel):
         group_axes_used = set()
         local_axes_used = set()
 
-        for iname in insn.all_inames():
+        for iname in kernel.insn_inames(insn):
             tag = kernel.iname_to_tag.get(iname)
 
             if isinstance(tag, LocalIndexTag):
@@ -53,7 +53,7 @@ def check_for_double_use_of_hw_axes(kernel):
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
-        for iname in insn.all_inames():
+        for iname in kernel.insn_inames(insn):
             tag = kernel.iname_to_tag.get(iname)
             if isinstance(tag, UniqueTag):
                 key = tag.key
@@ -74,7 +74,7 @@ def check_for_inactive_iname_access(kernel):
         expression_indices = depmap(insn.expression)
         expression_inames = expression_indices & kernel.all_inames()
 
-        if not expression_inames <= insn.all_inames():
+        if not expression_inames <= kernel.insn_inames(insn):
             raise RuntimeError(
                     "instructiosn '%s' references "
                     "inames that the instruction does not depend on"
@@ -100,7 +100,7 @@ def check_for_write_races(kernel):
         assignee_indices = set(strip_var(index) for index in assignee_indices)
 
         assignee_inames = assignee_indices & kernel.all_inames()
-        if not assignee_inames <= insn.all_inames():
+        if not assignee_inames <= kernel.insn_inames(insn):
             raise RuntimeError(
                     "assignee of instructiosn '%s' references "
                     "iname that the instruction does not depend on"
@@ -114,7 +114,7 @@ def check_for_write_races(kernel):
 
             parallel_insn_inames = set(
                     iname
-                    for iname in insn.all_inames()
+                    for iname in kernel.insn_inames(insn)
                     if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
 
             inames_without_write_dep = parallel_insn_inames - (
@@ -125,7 +125,7 @@ def check_for_write_races(kernel):
             if temp_var.is_local == True:
                 local_parallel_insn_inames = set(
                         iname
-                        for iname in insn.all_inames()
+                        for iname in kernel.insn_inames(insn)
                         if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)
                         and not isinstance(kernel.iname_to_tag.get(iname), GroupIndexTag))
 
@@ -135,7 +135,7 @@ def check_for_write_races(kernel):
             elif temp_var.is_local == False:
                 ilp_inames = set(
                         iname
-                        for iname in insn.all_inames()
+                        for iname in kernel.insn_inames(insn)
                         if isinstance(kernel.iname_to_tag.get(iname), IlpTag))
 
                 inames_without_write_dep = ilp_inames - (
@@ -204,10 +204,10 @@ def check_implemented_domains(kernel, implemented_domains):
             insn_impl_domain = insn_impl_domain | idomain
         insn_impl_domain = (
                 (insn_impl_domain & assumptions)
-                .project_out_except(insn.all_inames(), [dim_type.set]))
+                .project_out_except(kernel.insn_inames(insn), [dim_type.set]))
 
         desired_domain = ((kernel.domain & assumptions)
-            .project_out_except(insn.all_inames(), [dim_type.set]))
+            .project_out_except(kernel.insn_inames(insn), [dim_type.set]))
 
         if insn_impl_domain != desired_domain:
             i_minus_d = insn_impl_domain - desired_domain
@@ -228,7 +228,7 @@ def check_implemented_domains(kernel, implemented_domains):
 
                 iname_to_dim = pt.get_space().get_var_dict()
                 point_axes = []
-                for iname in insn.all_inames() | parameter_inames:
+                for iname in kernel.insn_inames(insn) | parameter_inames:
                     tp, dim = iname_to_dim[iname]
                     point_axes.append("%s=%d" % (iname, pt.get_coordinate(tp, dim)))
 
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e67898ebd..688ae2c7f 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -52,7 +52,7 @@ def generate_ilp_instances(kernel, insn, codegen_state):
 
     # {{{ pass 2: treat all ILP dimensions
 
-    for iname in insn.all_inames():
+    for iname in kernel.insn_inames(insn):
         tag = kernel.iname_to_tag.get(iname)
 
         if not isinstance(tag, IlpTag):
@@ -99,7 +99,7 @@ def generate_instruction_code(kernel, insn, codegen_state):
         insn_code = Assign(ccm(insn.assignee), ccm(insn.expression))
         from loopy.codegen.bounds import wrap_in_bounds_checks
         insn_code, impl_domain = wrap_in_bounds_checks(
-                ccm, kernel.domain, insn.all_inames(), ilpi.implemented_domain,
+                ccm, kernel.domain, kernel.insn_inames(insn), ilpi.implemented_domain,
                 insn_code)
 
         result.append(GeneratedInstruction(
diff --git a/loopy/cse.py b/loopy/cse.py
index e76145292..ab576a701 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -283,13 +283,11 @@ def make_compute_insn(kernel, cse_tag, lead_expr, target_var_name,
 
     # {{{ decide whether to force a dep
 
-    forced_iname_deps = set()
-
     from loopy.symbolic import IndexVariableFinder
     dependencies = IndexVariableFinder(
             include_reduction_inames=False)(lead_expr)
 
-    parent_inames = insn.all_inames() | insn.reduction_inames()
+    parent_inames = kernel.insn_inames(insn) | insn.reduction_inames()
     #print dependencies, parent_inames
     #assert dependencies <= parent_inames
 
@@ -326,8 +324,7 @@ def make_compute_insn(kernel, cse_tag, lead_expr, target_var_name,
     return Instruction(
             id=kernel.make_unique_instruction_id(based_on=insn_prefix+"_compute"),
             assignee=assignee,
-            expression=new_inner_expr,
-            forced_iname_deps=forced_iname_deps)
+            expression=new_inner_expr)
 
 
 
@@ -499,9 +496,7 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[],
 
     for insn in kernel.instructions:
         new_expr = cse_cb_mapper(insn.expression)
-        new_insns.append(insn.copy(
-            expression=new_expr,
-            forced_iname_deps=insn.all_inames()))
+        new_insns.append(insn.copy(expression=new_expr))
 
     # }}}
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index ee1bcba47..53ef81e68 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -259,8 +259,8 @@ class Instruction(Record):
     def reduction_inames(self):
         def map_reduction(expr, rec):
             rec(expr.expr)
-            for iname in expr.inames:
-                result.add(iname.lstrip("@"))
+            for iname in expr.untagged_inames:
+                result.add(iname)
 
         from loopy.symbolic import ReductionCallbackMapper
         cb_mapper = ReductionCallbackMapper(map_reduction)
@@ -270,19 +270,9 @@ class Instruction(Record):
 
         return result
 
-    @memoize_method
-    def all_inames(self):
-        """Does not (!) include reduction inames."""
-
-        from loopy.symbolic import IndexVariableFinder
-        ivarf = IndexVariableFinder(include_reduction_inames=False)
-        index_vars = (ivarf(self.expression) | ivarf(self.assignee))
-
-        return index_vars | set(self.forced_iname_deps)
-
     def __str__(self):
-        result = "%s: %s <- %s\n    [%s]" % (self.id,
-                self.assignee, self.expression, ", ".join(sorted(self.all_inames())))
+        result = "%s: %s <- %s" % (self.id,
+                self.assignee, self.expression)
 
         if self.boostable == True:
             result += " (boostable)"
@@ -658,6 +648,76 @@ class LoopKernel(Record):
             if id_str not in used_ids:
                 return id_str
 
+    @memoize_method
+    def all_inames(self):  # set of the keys of self.space's dim_type.set var dict
+        from islpy import dim_type
+        return set(self.space.get_var_dict(dim_type.set).iterkeys())
+
+    @memoize_method
+    def all_insn_inames(self):  # returns a dict: insn id -> set of inames for that insn
+        from loopy.symbolic import get_dependencies
+
+        insn_id_to_inames = {}
+        insn_assignee_inames = {}
+
+        for insn in self.instructions:
+            read_deps = get_dependencies(insn.expression)
+            write_deps = get_dependencies(insn.assignee)
+            deps = read_deps | write_deps
+
+            iname_deps = (
+                    deps & self.all_inames()  # '&' binds tighter than '|' below
+                    | insn.forced_iname_deps)
+
+            insn_id_to_inames[insn.id] = iname_deps
+            insn_assignee_inames[insn.id] = write_deps & self.all_inames()
+
+        writers = self.find_writers()
+        temp_var_names = set(self.temporary_variables.iterkeys())
+
+        # fixed point iteration until all iname dep sets have converged
+        while True:
+            did_something = False
+            for insn in self.instructions:
+                for tv_name in (get_dependencies(insn.expression)
+                        & temp_var_names):
+                    implicit_inames = None  # ends up as the intersection over all writers
+
+                    for writer_id in writers[tv_name]:
+                        writer_implicit_inames = (  # writer inames absent from its assignee
+                                insn_id_to_inames[writer_id]
+                                - insn_assignee_inames[writer_id])
+                        if implicit_inames is None:
+                            implicit_inames = writer_implicit_inames
+                        else:
+                            implicit_inames = (implicit_inames
+                                    & writer_implicit_inames)
+
+                    inames_old = insn_id_to_inames[insn.id]
+                    inames_new = inames_old | implicit_inames  # the reader inherits them
+                    insn_id_to_inames[insn.id] = inames_new
+
+                    if inames_new != inames_old:
+                        did_something = True  # a set grew, so run another sweep
+
+            if not did_something:
+                break
+
+        return insn_id_to_inames
+
+    @memoize_method
+    def all_referenced_inames(self):  # union of the iname sets from all_insn_inames()
+        result = set()
+        for inames in self.all_insn_inames().itervalues():
+            result.update(inames)
+        return result
+
+    def insn_inames(self, insn):  # insn: an insn id (str) or an object with an .id
+        if isinstance(insn, str):
+            return self.all_insn_inames()[insn]
+        else:
+            return self.all_insn_inames()[insn.id]
+
     @property
     @memoize_method
     def sequential_inames(self):
@@ -680,6 +740,44 @@ class LoopKernel(Record):
 
         return result
 
+    def find_readers(self):
+        """
+        :return: a dict that maps variable names to ids of insns that
+            read that variable.
+        """
+        result = {}
+
+        admissible_vars = (
+                set(arg.name for arg in self.args)
+                | set(self.temporary_variables.iterkeys()))
+
+        for insn in self.instructions:
+            for var_name in insn.get_read_var_names() & admissible_vars:
+                result.setdefault(var_name, set()).add(insn.id)
+
+        return result
+
+    def find_writers(self):
+        """
+        :return: a dict that maps variable names to ids of insns that
+            write to that variable.
+        :raise RuntimeError: if an assignee is neither an arg nor a temporary.
+        """
+        result = {}
+
+        admissible_vars = (
+                set(arg.name for arg in self.args)
+                | set(self.temporary_variables.iterkeys()))
+
+        for insn in self.instructions:
+            var_name = insn.get_assignee_var_name()
+            if var_name not in admissible_vars:
+                raise RuntimeError("writing to '%s' is not allowed" % var_name)
+
+            result.setdefault(var_name, set()).add(insn.id)
+
+        return result
+
     @property
     @memoize_method
     def iname_to_dim(self):
@@ -729,11 +827,6 @@ class LoopKernel(Record):
             return [arg.name for arg in self.args if isinstance(arg, ScalarArg)
                     if arg.name in loop_arg_names]
 
-    @memoize_method
-    def all_inames(self):
-        from islpy import dim_type
-        return set(self.space.get_var_dict(dim_type.set).iterkeys())
-
     @memoize_method
     def get_iname_bounds(self, iname):
         dom_intersect_assumptions = (
@@ -771,7 +864,7 @@ class LoopKernel(Record):
     def get_grid_sizes(self, ignore_auto=False):
         all_inames_by_insns = set()
         for insn in self.instructions:
-            all_inames_by_insns |= insn.all_inames()
+            all_inames_by_insns |= self.insn_inames(insn)
 
         if not all_inames_by_insns <= self.all_inames():
             raise RuntimeError("inames collected from instructions (%s) "
@@ -888,6 +981,7 @@ class LoopKernel(Record):
         lines.append("")
         for insn in self.instructions:
             lines.append(str(insn))
+            lines.append("    [%s]" % ",".join(sorted(self.insn_inames(insn))))
 
         return "\n".join(lines)
 
@@ -917,4 +1011,26 @@ def find_var_base_indices_and_shape_from_inames(domain, inames):
 
 
 
+def get_dot_dependency_graph(kernel, iname_cluster=False, iname_edge=True):  # returns dot source (str)
+    lines = []
+    for insn in kernel.instructions:
+        lines.append("%s [shape=\"box\"];" % insn.id)  # one box node per insn
+        for dep in insn.insn_deps:
+            lines.append("%s -> %s;" % (dep, insn.id))  # edge from dependency to dependent insn
+
+        if iname_edge:
+            for iname in kernel.insn_inames(insn):
+                lines.append("%s -> %s [style=\"dotted\"];" % (iname, insn.id))  # dotted edge: iname to insn
+
+    if iname_cluster:
+        for iname in kernel.all_inames():
+            lines.append("subgraph cluster_%s { label=\"%s\" %s }" % (iname, iname,  # one cluster per iname
+                " ".join(insn.id for insn in kernel.instructions
+                    if iname in kernel.insn_inames(insn))))
+
+    return "digraph loopy_deps {\n%s\n}" % "\n".join(lines)
+
+
+
+
 # vim: foldmethod=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index eede44a43..ca6b0d0c6 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -12,7 +12,7 @@ def mark_local_temporaries(kernel):
     new_temp_vars = {}
     from loopy.kernel import LocalIndexTagBase
 
-    writers = find_accessors(kernel, readers=False)
+    writers = kernel.find_writers()
 
     from loopy.symbolic import get_dependencies
 
@@ -65,11 +65,10 @@ def duplicate_reduction_inames(kernel):
             from pymbolic.mapper.substitutor import make_subst_func
             from pymbolic import var
 
-            old_inames = [iname.lstrip("@") for iname in reduction_expr.inames]
             subst_dict = dict(
                     (old_iname, var(new_iname))
                     for old_iname, new_iname in zip(
-                        old_inames, new_red_inames))
+                        reduction_expr.untagged_inames, new_red_inames))
             subst_map = SubstitutionMapper(make_subst_func(subst_dict))
 
             child = subst_map(child)
@@ -118,7 +117,7 @@ def realize_reduction(kernel):
         # {{{ see if this reduction is nested inside some ILP loops
 
         ilp_inames = [iname
-                for iname in insn.all_inames()
+                for iname in kernel.insn_inames(insn)
                 if isinstance(kernel.iname_to_tag.get(iname), IlpTag)]
 
         from loopy.isl_helpers import static_max_of_pw_aff
@@ -158,7 +157,7 @@ def realize_reduction(kernel):
                     based_on="%s_%s_init" % (insn.id, "_".join(expr.inames)),
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
-                forced_iname_deps=insn.all_inames() - set(expr.inames),
+                forced_iname_deps=kernel.insn_inames(insn) - set(expr.inames),
                 expression=expr.operation.neutral_element)
 
         new_insns.append(init_insn)
@@ -170,7 +169,7 @@ def realize_reduction(kernel):
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
                 insn_deps=set([init_insn.id]) | insn.insn_deps,
-                forced_iname_deps=insn.all_inames() | set(expr.inames))
+                forced_iname_deps=kernel.insn_inames(insn) | set(expr.inames))
 
         new_insns.append(reduction_insn)
 
@@ -190,7 +189,7 @@ def realize_reduction(kernel):
                     expression=new_expression,
                     insn_deps=insn.insn_deps
                         | new_insn_insn_deps,
-                    forced_iname_deps=insn.all_inames())
+                    forced_iname_deps=kernel.insn_inames(insn))
 
         new_insns.append(new_insn)
 
@@ -202,39 +201,8 @@ def realize_reduction(kernel):
 
 # {{{ automatic dependencies, find boostability of instructions
 
-def find_accessors(kernel, readers):
-    """
-    :arg readers: whether to find insns that read or that write
-        the variables in question.
-    :return: a dict that maps variable names to ids of insns that
-        write to that variable.
-    """
-    result = {}
-
-    admissible_vars = (
-            set(arg.name for arg in kernel.args)
-            | set(kernel.temporary_variables.iterkeys()))
-
-    for insn in kernel.instructions:
-        if readers:
-            var_names = insn.get_read_var_names() & admissible_vars
-        else:
-            var_name = insn.get_assignee_var_name()
-
-            if var_name not in admissible_vars:
-                raise RuntimeError("writing to '%s' is not allowed" % var_name)
-            var_names = [var_name]
-
-        for var_name in var_names:
-            result.setdefault(var_name, set()).add(insn.id)
-
-    return result
-
-
-
-
 def add_boostability_and_automatic_dependencies(kernel):
-    writer_map = find_accessors(kernel, readers=False)
+    writer_map = kernel.find_writers()
 
     arg_names = set(arg.name for arg in kernel.args)
 
@@ -363,7 +331,7 @@ def get_axis_0_ranking(kernel, insn):
     from loopy.kernel import AutoLocalIndexTagBase
     axis0_candidates = set(
             iname
-            for iname in insn.all_inames()
+            for iname in kernel.insn_inames(insn)
             if isinstance(kernel.iname_to_tag.get(iname),
                 AutoLocalIndexTagBase))
 
@@ -427,7 +395,7 @@ def get_axis_0_ranking(kernel, insn):
                     + vote_strength)
 
     if saw_relevant_access:
-        return sorted((iname for iname in insn.all_inames()),
+        return sorted((iname for iname in kernel.insn_inames(insn)),
                 key=lambda iname: vote_count_for_l0.get(iname, 0),
                 reverse=True)
     else:
@@ -524,7 +492,7 @@ def assign_automatic_axes(kernel, phase="axis0", local_size=None):
     for insn in kernel.instructions:
         auto_axis_inames = [
                 iname
-                for iname in insn.all_inames()
+                for iname in kernel.insn_inames(insn)
                 if isinstance(kernel.iname_to_tag.get(iname),
                     AutoLocalIndexTagBase)]
 
@@ -533,7 +501,7 @@ def assign_automatic_axes(kernel, phase="axis0", local_size=None):
 
         assigned_local_axes = set()
 
-        for iname in insn.all_inames():
+        for iname in kernel.insn_inames(insn):
             tag = kernel.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 assigned_local_axes.add(tag.axis)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 7b8524cf9..b27069670 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -141,7 +141,7 @@ def find_used_inames_within(kernel, sched_index):
 
     result = set()
     for sched_item in run_insns:
-        result.update(kernel.id_to_insn[sched_item.insn_id].all_inames())
+        result.update(kernel.insn_inames(sched_item.insn_id))
 
     return result
 
@@ -218,11 +218,14 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
     # {{{ see if any insn can be scheduled now
 
     unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids)
+    insns_with_satisfied_deps = set()
 
     for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
         schedule_now = set(insn.insn_deps) <= scheduled_insn_ids
+        if schedule_now:
+            insns_with_satisfied_deps.add(insn_id)
 
         if not schedule_now:
             if debug_mode:
@@ -242,7 +245,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
             for active_loop_count in xrange(len(active_inames), -1, -1):
                 outer_active_inames = set(active_inames[:active_loop_count])
                 if (
-                        insn.all_inames() - parallel_inames
+                        kernel.insn_inames(insn) - parallel_inames
                         <=
                         outer_active_inames - parallel_inames):
 
@@ -257,7 +260,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
                     else:
                         print ("instruction '%s' is missing inames '%s'"
                                 % (insn.id, ",".join(
-                                    (insn.all_inames() - parallel_inames)
+                                    (kernel.insn_inames(insn) - parallel_inames)
                                     -
                                     (outer_active_inames - parallel_inames))))
 
@@ -266,7 +269,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
             # the exactly correct set of loops.
 
             schedule_now = schedule_now and (
-                    insn.all_inames() - parallel_inames
+                    kernel.insn_inames(insn) - parallel_inames
                     ==
                     active_inames_set - parallel_inames)
 
@@ -289,7 +292,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
 
     # {{{ see if any loop can be entered now
 
-    available_loops = (kernel.all_inames()
+    available_loops = (kernel.all_referenced_inames()
             # loops can only be entered once
             - entered_inames
             # there's no notion of 'entering' a parallel loop
@@ -306,8 +309,11 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
 
             hypothetical_active_loops = active_inames_set | set([iname])
             for insn_id in unscheduled_insn_ids:
+                if insn_id not in insns_with_satisfied_deps:
+                    continue
+
                 insn = kernel.id_to_insn[insn_id]
-                if hypothetical_active_loops <= insn.all_inames():
+                if hypothetical_active_loops <= kernel.insn_inames(insn):
                     useful = True
                     break
 
@@ -365,7 +371,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]):
         can_leave = True
         for insn_id in unscheduled_insn_ids:
             insn = kernel.id_to_insn[insn_id]
-            if last_entered_loop in insn.all_inames():
+            if last_entered_loop in kernel.insn_inames(insn):
                 can_leave = False
                 break
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 6ecf02360..21ff0bd16 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -2,6 +2,8 @@
 
 from __future__ import division
 
+from pytools import memoize, memoize_method
+
 from pymbolic.primitives import AlgebraicLeaf
 from pymbolic.mapper import (
         CombineMapper as CombineMapperBase,
@@ -55,6 +57,16 @@ class Reduction(AlgebraicLeaf):
     def stringifier(self):
         return StringifyMapper
 
+    @property
+    @memoize_method
+    def untagged_inames(self):  # tuple of self.inames with leading "@" characters stripped
+        return tuple(iname.lstrip("@") for iname in self.inames)
+
+    @property
+    @memoize_method
+    def untagged_inames_set(self):  # the same names as untagged_inames, as a set
+        return set(self.untagged_inames)
+
     mapper_method = intern("map_reduction")
 
 # }}}
@@ -82,7 +94,9 @@ class StringifyMapper(StringifyMapperBase):
 
 class DependencyMapper(DependencyMapperBase):
     def map_reduction(self, expr):
-        return self.rec(expr.expr)
+        from pymbolic.primitives import Variable
+        return (self.rec(expr.expr)
+                - set(Variable(iname) for iname in expr.untagged_inames))
 
 class BidirectionalUnifier(BidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
@@ -552,14 +566,13 @@ class IndexVariableFinder(CombineMapper):
     def map_reduction(self, expr):
         result = self.rec(expr.expr)
 
-        real_inames = set(iname.lstrip("@") for iname in expr.inames)
-        if not (real_inames & result):
+        if not (expr.untagged_inames_set & result):
             raise RuntimeError("reduction '%s' does not depend on "
                     "reduction inames (%s)" % (expr, ",".join(expr.inames)))
         if self.include_reduction_inames:
             return result
         else:
-            return result - real_inames
+            return result - expr.untagged_inames_set
 
 # }}}
 
@@ -645,6 +658,7 @@ class PrimeAdder(IdentityMapper):
 
 # }}}
 
+@memoize
 def get_dependencies(expr):
     from loopy.symbolic import DependencyMapper
     dep_mapper = DependencyMapper(composite_leaves=False)
diff --git a/test/test_sem.py b/test/test_sem.py
index 6b3ac68b8..7212860d0 100644
--- a/test/test_sem.py
+++ b/test/test_sem.py
@@ -304,21 +304,25 @@ def test_sem_3d(ctx_factory):
             name="semlap", assumptions="K>=1")
 
 
-    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
-    knl = lp.add_prefetch(knl, "D", ["m", "j"])
-    knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[e,i,j,k]")
-    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
-    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
-    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
-
-    seq_knl = knl
-    print seq_knl
-    #print lp.preprocess_kernel(seq_knl)
-    1/0
+    def add_pf(knl):
+        knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
+        knl = lp.add_prefetch(knl, "D", ["m", "j"])
+        knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[e,i,j,k]")
+        knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
+        knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
+        knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])
+        return knl  # both callers rebind the result; without this they get None
+
+    seq_knl = add_pf(knl)
 
     knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
     #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp")
 
+    knl = add_pf(knl)
+    #print seq_knl
+    #print lp.preprocess_kernel(seq_knl)
+    #1/0
+
     knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1"))
 
     kernel_gen = lp.generate_loop_schedules(knl,
-- 
GitLab