From d6de541089db127346e3fc90aeae69c70ead6833 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 23 Sep 2012 12:55:30 -0500
Subject: [PATCH] Add index to private variables for ILP.

---
 MEMO                |   3 -
 loopy/kernel.py     |   6 +-
 loopy/preprocess.py | 142 +++++++++++++++++++++++++++++++++++---------
 3 files changed, 116 insertions(+), 35 deletions(-)

diff --git a/MEMO b/MEMO
index f29981a08..09245716d 100644
--- a/MEMO
+++ b/MEMO
@@ -72,13 +72,10 @@ Fixes:
 Future ideas
 ^^^^^^^^^^^^
 
-<<<<<<< HEAD
 - Put all OpenCL functions into mangler
 
 - Fuse: store/fetch elimination?
 
-=======
->>>>>>> d0f46221e2249d7894aed2d5e7ab21e84c419eac
 - Expose iname-duplicate-and-rename as a primitive.
 
 - Array language
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 835f5abf9..d68fb938a 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -1340,10 +1340,10 @@ class LoopKernel(Record):
         return result
 
     def insn_inames(self, insn):
-        if isinstance(insn, str):
-            return self.all_insn_inames()[insn]
-        else:
+        if isinstance(insn, Instruction):
             return self.all_insn_inames()[insn.id]
+        else:
+            return self.all_insn_inames()[insn]
 
     @memoize_method
     def iname_to_insns(self):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f7d6f25d4..c9fa85cf7 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -194,8 +194,6 @@ def realize_reduction(kernel, insn_id_filter=None):
     new_insns = []
     new_temporary_variables = kernel.temporary_variables.copy()
 
-    from loopy.kernel import IlpBaseTag
-
     from loopy.codegen.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
@@ -203,38 +201,12 @@ def realize_reduction(kernel, insn_id_filter=None):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
-        # {{{ see if this reduction is nested inside some ILP loops
-
-        ilp_inames = [iname
-                for iname in temp_kernel.insn_inames(insn)
-                if isinstance(temp_kernel.iname_to_tag.get(iname), IlpBaseTag)]
-
-        from loopy.isl_helpers import static_max_of_pw_aff
-
-        ilp_iname_lengths = []
-        for iname in ilp_inames:
-            # Using the original kernel is ok here. Nothing in realize_reductions
-            # messes with inames. This is useful because it takes advantage
-            # of bounds caching.
-            bounds = kernel.get_iname_bounds(iname)
-
-            from loopy.symbolic import pw_aff_to_expr
-            ilp_iname_lengths.append(
-                    int(pw_aff_to_expr(
-                        static_max_of_pw_aff(bounds.size, constants_only=True))))
-
-        # }}}
-
         from pymbolic import var
 
         target_var_name = kernel.make_unique_var_name("acc_"+"_".join(expr.inames),
                 extra_used_vars=set(new_temporary_variables))
         target_var = var(target_var_name)
 
-        if ilp_inames:
-            target_var = target_var[
-                    tuple(var(ilp_iname) for ilp_iname in ilp_inames)]
-
         arg_dtype = type_inf_mapper(expr.expr)
 
         from loopy.kernel import Instruction
@@ -242,8 +214,8 @@ def realize_reduction(kernel, insn_id_filter=None):
         from loopy.kernel import TemporaryVariable
         new_temporary_variables[target_var_name] = TemporaryVariable(
                 name=target_var_name,
+                shape=(),
                 dtype=expr.operation.result_dtype(arg_dtype, expr.inames),
-                shape=tuple(ilp_iname_lengths),
                 is_local=False)
 
         new_id = temp_kernel.make_unique_instruction_id(
@@ -322,6 +294,111 @@ def realize_reduction(kernel, insn_id_filter=None):
 
 # }}}
 
+# {{{ duplicate private vars for ilp
+
+from loopy.symbolic import IdentityMapper
+
+class ExtraInameIndexInserter(IdentityMapper):
+    def __init__(self, var_to_new_inames):
+        self.var_to_new_inames = var_to_new_inames
+
+    def map_subscript(self, expr):
+        res = IdentityMapper.map_subscript(self, expr)
+        try:
+            new_idx = self.var_to_new_inames[expr.aggregate.name]
+        except KeyError:
+            return IdentityMapper.map_subscript(self, expr)
+        else:
+            return res.aggregate[res.index + new_idx]
+
+    def map_variable(self, expr):
+        try:
+            new_idx = self.var_to_new_inames[expr.name]
+        except KeyError:
+            return expr
+        else:
+            return expr[new_idx]
+
+def duplicate_private_temporaries_for_ilp(kernel):
+    wmap = kernel.writer_map()
+
+    from loopy.kernel import IlpBaseTag
+    from loopy.symbolic import get_dependencies
+
+    var_to_new_ilp_inames = {}
+
+    # {{{ find variables that need extra indices
+
+    for tv in kernel.temporary_variables.itervalues():
+        for writer_insn_id in wmap[tv.name]:
+            writer_insn = kernel.id_to_insn[writer_insn_id]
+            ilp_inames = frozenset(iname
+                    for iname in kernel.insn_inames(writer_insn)
+                    if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag))
+
+            referenced_ilp_inames = (ilp_inames
+                    & get_dependencies(writer_insn.assignee))
+
+            new_ilp_inames = ilp_inames - referenced_ilp_inames
+
+            if tv.name in var_to_new_ilp_inames:
+                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
+                    raise RuntimeError("instruction '%s' requires adding "
+                            "indices for ILP inames '%s', but previous "
+                            "instructions required inames'%s'"
+                            % (writer_insn_id, ", ".join(new_ilp_inames),
+                                ", ".join(var_to_new_ilp_inames[tv.name])))
+
+                continue
+
+            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)
+
+    # }}}
+
+    # {{{ find ilp iname lengths
+
+    from loopy.isl_helpers import static_max_of_pw_aff
+    from loopy.symbolic import pw_aff_to_expr
+
+    ilp_iname_to_length = {}
+    for ilp_inames in var_to_new_ilp_inames.itervalues():
+        for iname in ilp_inames:
+            if iname in ilp_iname_to_length:
+                continue
+
+            bounds = kernel.get_iname_bounds(iname)
+            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
+                        static_max_of_pw_aff(bounds.size, constants_only=True)))
+
+            assert static_max_of_pw_aff(
+                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()
+
+    # }}}
+
+    # {{{ change temporary variables
+
+    new_temp_vars = kernel.temporary_variables.copy()
+    for tv_name, inames in var_to_new_ilp_inames.iteritems():
+        tv = new_temp_vars[tv_name]
+        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)
+
+        shape = tv.shape
+        if shape is None:
+            shape = ()
+
+        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape)
+
+    # }}}
+
+    from pymbolic import var
+    return (kernel
+            .copy(temporary_variables=new_temp_vars)
+            .map_expressions(ExtraInameIndexInserter(
+                dict((var_name, tuple(var(iname) for iname in inames))
+                    for var_name, inames in var_to_new_ilp_inames.iteritems()))))
+
+# }}}
+
 # {{{ automatic dependencies, find boostability of instructions
 
 def add_boostability_and_automatic_dependencies(kernel):
@@ -783,6 +860,13 @@ def preprocess_kernel(kernel):
 
     kernel = realize_reduction(kernel)
 
+    # Ordering restriction:
+    # duplicate_private_temporaries_for_ilp because reduction accumulators
+    # need to be duplicated by this.
+
+    kernel = duplicate_private_temporaries_for_ilp(kernel)
+    print kernel
+
     kernel = mark_local_temporaries(kernel)
     kernel = assign_automatic_axes(kernel)
     kernel = add_boostability_and_automatic_dependencies(kernel)
-- 
GitLab