From bbe3669296c2ecd9121b37efd90618eb58d92293 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 5 Jun 2016 12:56:06 -0500
Subject: [PATCH] Take accessed vars into account when determining default
 inames of precompute instructions

---
 loopy/kernel/tools.py         | 86 +++++++++++++++++++++--------------
 loopy/transform/precompute.py | 33 ++++++++++++--
 2 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 1775032cb..a4e6ab0d6 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -125,7 +125,41 @@ def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
 # }}}
 
 
-# {{{ find_all_insn_inames fixed point iteration
+# {{{ find_all_insn_inames fixed point iteration (deprecated)
+
+def guess_iname_deps_based_on_var_use(kernel, insn, insn_id_to_inames=None):
+    # For every variable that insn reads and the kernel writes, intersect the
+    # "implicit" inames of all its writers; return the union of these sets
+    # minus insn.reduction_inames(). Writer inames come from insn_id_to_inames
+    # if given, otherwise from each writer's forced_iname_deps.
+    result = frozenset()
+
+    writer_map = kernel.writer_map()
+
+    for tv_name in (insn.read_dependency_names() & kernel.get_written_variables()):
+        tv_implicit_inames = None
+        # (None means "no writer examined yet", not "no inames".)
+        for writer_id in writer_map[tv_name]:
+            writer_insn = kernel.id_to_insn[writer_id]
+            if insn_id_to_inames is None:
+                writer_inames = writer_insn.forced_iname_deps
+            else:
+                writer_inames = insn_id_to_inames[writer_id]
+            # Implicit: writer inames not among its write_dependency_names().
+            writer_implicit_inames = (
+                    writer_inames
+                    - (writer_insn.write_dependency_names() & kernel.all_inames()))
+            if tv_implicit_inames is None:
+                tv_implicit_inames = writer_implicit_inames
+            else:
+                tv_implicit_inames = (tv_implicit_inames
+                        & writer_implicit_inames)
+
+        if tv_implicit_inames is not None:
+            result = result | tv_implicit_inames
+
+    return result - insn.reduction_inames()
+
 
 def find_all_insn_inames(kernel):
     logger.debug("%s: find_all_insn_inames: start" % kernel.name)
@@ -166,8 +200,6 @@ def find_all_insn_inames(kernel):
         insn_id_to_inames[insn.id] = iname_deps
         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
 
-    written_vars = kernel.get_written_variables()
-
     # fixed point iteration until all iname dep sets have converged
 
     # Why is fixed point iteration necessary here? Consider the following
@@ -190,38 +222,22 @@ def find_all_insn_inames(kernel):
 
             # {{{ depdency-based propagation
 
-            # For all variables that insn depends on, find the intersection
-            # of iname deps of all writers, and add those to insn's
-            # dependencies.
-
-            for tv_name in (all_read_deps[insn.id] & written_vars):
-                implicit_inames = None
-
-                for writer_id in writer_map[tv_name]:
-                    writer_implicit_inames = (
-                            insn_id_to_inames[writer_id]
-                            - insn_assignee_inames[writer_id])
-                    if implicit_inames is None:
-                        implicit_inames = writer_implicit_inames
-                    else:
-                        implicit_inames = (implicit_inames
-                                & writer_implicit_inames)
-
-                inames_old = insn_id_to_inames[insn.id]
-                inames_new = (inames_old | implicit_inames) \
-                            - insn.reduction_inames()
-                insn_id_to_inames[insn.id] = inames_new
-
-                if inames_new != inames_old:
-                    did_something = True
-
-                    warn_with_kernel(kernel, "inferred_iname",
-                            "The iname(s) '%s' on instruction '%s' was "
-                            "automatically added. "
-                            "This is deprecated. Please add the iname "
-                            "to the instruction "
-                            "implicitly, e.g. by adding '{inames=...}"
-                            % (inames_new-inames_old, insn.id))
+            inames_old = insn_id_to_inames[insn.id]
+            inames_new = inames_old | guess_iname_deps_based_on_var_use(
+                    kernel, insn, insn_id_to_inames)
+
+            insn_id_to_inames[insn.id] = inames_new
+
+            if inames_new != inames_old:
+                did_something = True
+
+                # Inference is deprecated: name the added inames for the user.
+                warn_with_kernel(kernel, "inferred_iname",
+                        "The iname(s) '%s' on instruction '%s' was "
+                        "automatically added. This is deprecated. "
+                        "Please add the iname to the instruction "
+                        "explicitly, e.g. by adding '{inames=...}'"
+                        % (inames_new-inames_old, insn.id))
 
             # }}}
 
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index f9d71b9f1..ce54bb54c 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -239,6 +239,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, temporary_name=None, precompute_inames=None,
+        precompute_outer_inames=None,
         storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
         fetch_bounding_box=False,
         temporary_scope=None, temporary_is_local=None,
@@ -307,6 +308,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         tuple, in which case names will be automatically created.
         May also equivalently be a comma-separated string.
 
+    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
+        the compute instruction is nested. If *None*, guess from dependencies.
+
     :arg compute_insn_id: The ID of the instruction performing the precomputation.
 
     If `storage_axes` is not specified, it defaults to the arrangement
@@ -766,11 +770,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             id=compute_insn_id,
             assignee=assignee,
             expression=compute_expression,
-            forced_iname_deps=(
-                frozenset(non1_storage_axis_names)
-                | frozenset(
-                    (expanding_usage_arg_deps | value_inames)
-                    - sweep_inames_set))
+            # forced_iname_deps determined below
             )
 
     # }}}
@@ -791,6 +791,29 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
+    # {{{ determine inames for compute insn
+
+    if precompute_outer_inames is None:
+        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
+        precompute_outer_inames = (
+                    frozenset(non1_storage_axis_names)
+                    | frozenset(
+                        (expanding_usage_arg_deps | value_inames)
+                        - sweep_inames_set)
+                    | guess_iname_deps_based_on_var_use(kernel, compute_insn))
+    else:
+        if not isinstance(precompute_outer_inames, frozenset):
+            raise TypeError("precompute_outer_inames must be a frozenset")
+
+    kernel = kernel.copy(
+            instructions=[
+                insn.copy(forced_iname_deps=precompute_outer_inames)
+                if insn.id == compute_insn_id
+                else insn
+                for insn in kernel.instructions])
+
+    # }}}
+
     # {{{ set up temp variable
 
     import loopy as lp
-- 
GitLab