From 6f16409112310404e167f84d7fdd52143161aae1 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 23 Aug 2015 14:30:24 -0500
Subject: [PATCH] Fix nesting safety of loop_insn_dep_map

---
 loopy/schedule.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/loopy/schedule.py b/loopy/schedule.py
index 55a1c6aab..1d0dc1221 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -171,7 +171,7 @@ def find_used_inames_within(kernel, sched_index):
     return result
 
 
-def loop_nest_map(kernel):
+def find_loop_nest_map(kernel):
     """Returns a dictionary mapping inames to other inames that are
     always nested around them.
     """
@@ -212,7 +212,7 @@ def loop_nest_map(kernel):
     return result
 
 
-def loop_insn_dep_map(kernel):
+def find_loop_insn_dep_map(kernel, loop_nest_map):
     """Returns a dictionary mapping inames to other instruction ids that need to
     be scheduled before the iname should be eligible for scheduling.
     """
@@ -230,10 +230,27 @@ def loop_insn_dep_map(kernel):
                 dep_insn_inames = kernel.insn_inames(dep_insn)
 
                 if iname in dep_insn_inames:
-                    # Nothing to be learened, dependency is in loop over iname.
+                    # Nothing to be learned, dependency is in loop over iname
+                    # already.
                     continue
 
-                result.setdefault(iname, set()).add(dep_insn_id)
+                # To make sure dep_insn belongs outside of iname, we must prove
+                # (via loop_nest_map) that all inames that dep_insn will be
+                # executed inside are nested *around* iname.
+                if not dep_insn_inames <= loop_nest_map[iname]:
+                    continue
+
+                iname_dep = result.setdefault(iname, set())
+                if dep_insn_id not in iname_dep:
+                    logger.debug("{knl}: loop dependency map: iname '{iname}' "
+                            "depends on '{dep_insn}' via '{insn}'"
+                            .format(
+                                knl=kernel.name,
+                                iname=iname,
+                                dep_insn=dep_insn_id,
+                                insn=insn.id))
+
+                    iname_dep.add(dep_insn_id)
 
     return result
 
@@ -1226,10 +1243,11 @@ def generate_loop_schedules(kernel, debug_args={}):
             iname for iname in kernel.all_inames()
             if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
 
+    loop_nest_map = find_loop_nest_map(kernel)
     sched_state = SchedulerState(
             kernel=kernel,
-            loop_nest_map=loop_nest_map(kernel),
-            loop_insn_dep_map=loop_insn_dep_map(kernel),
+            loop_nest_map=loop_nest_map,
+            loop_insn_dep_map=find_loop_insn_dep_map(kernel, loop_nest_map),
             breakable_inames=ilp_inames,
             ilp_inames=ilp_inames,
             vec_inames=vec_inames,
-- 
GitLab