Implement reduction iname uniquification.

426c0319 · Andreas Klöckner · 020203ae
Commit 426c0319 authored 13 years ago by Andreas Klöckner
--- a/MEMO
+++ b/MEMO
@@ -34,6 +34,8 @@ Things to consider
 - Measure efficiency of corner cases
+- Loopy as a data model for implementing custom rewritings
 To-do
 ^^^^^
@@ -45,12 +47,18 @@ To-do
 - user interface for dim length prescription
+- Way too many barriers in SEM test.
 - Deal with equality constraints.
  (These arise, e.g., when partitioning a loop of length 16 into 16s.)
 Future ideas
 ^^^^^^^^^^^^
+- Float4 joining on fetch/store?
+- How can one automatically generate something like microblocks?
 - Better for loop bound generation
  -> Try a triangular loop

--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -699,7 +699,8 @@ class LoopKernel(Record):
        if all_inames_by_insns != self.all_inames():
            raise RuntimeError("inames collected from instructions (%s) "
                    "do not match domain inames (%s)"
-                    % (", ".join(all_inames_by_insns), ", ".join(self.all_inames())))
+                    % (", ".join(sorted(all_inames_by_insns)), 
+                        ", ".join(sorted(self.all_inames()))))
        global_sizes = {}
        local_sizes = {}
@@ -780,11 +781,13 @@ class LoopKernel(Record):
    def __str__(self):
        lines = []
-        for insn in self.instructions:
-            lines.append(str(insn))
-        lines.append("")
        for iname in sorted(self.all_inames()):
            lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname)))
+        lines.append("")
+        lines.append(str(self.domain))
+        lines.append("")
+        for insn in self.instructions:
+            lines.append(str(insn))
        return "\n".join(lines)

--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -6,6 +6,86 @@ import pyopencl.characterize as cl_char
+# {{{ make reduction variables unique
+def make_reduction_variables_unique(kernel):
+    # {{{ count number of uses of each reduction iname
+    def count_reduction_iname_uses(expr, rec):
+        rec(expr.expr)
+        for iname in expr.inames:
+            reduction_iname_uses[iname] = (
+                    reduction_iname_uses.get(iname, 0)
+                    + 1)
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(count_reduction_iname_uses)
+    reduction_iname_uses = {}
+    for insn in kernel.instructions:
+        cb_mapper(insn.expression)
+    # }}}
+    # {{{ make iname uses in reduction unique
+    def ensure_reduction_iname_uniqueness(expr, rec):
+        child = rec(expr.expr)
+        my_created_inames = []
+        new_red_inames = []
+        for iname in expr.inames:
+            if reduction_iname_uses[iname] > 1:
+                new_iname = kernel.make_unique_var_name(iname, set(new_inames))
+                old_inames.append(iname)
+                new_inames.append(new_iname)
+                my_created_inames.append(new_iname)
+                new_red_inames.append(new_iname)
+                reduction_iname_uses[iname] -= 1
+            else:
+                new_red_inames.append(iname)
+        if my_created_inames:
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+            from pymbolic import var
+            subst_dict = dict(
+                    (old_iname, var(new_iname))
+                    for old_iname, new_iname in zip(expr.inames, my_created_inames))
+            subst_map = SubstitutionMapper(make_subst_func(subst_dict))
+            child = subst_map(child)
+        from loopy.symbolic import Reduction
+        return Reduction(
+                operation=expr.operation,
+                inames=tuple(new_red_inames),
+                expr=child)
+    new_insns = []
+    old_inames = []
+    new_inames = []
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(ensure_reduction_iname_uniqueness)
+    new_insns = [
+        insn.copy(expression=cb_mapper(insn.expression))
+        for insn in kernel.instructions]
+    domain = kernel.domain
+    from loopy.isl_helpers import duplicate_axes
+    for old, new in zip(old_inames, new_inames):
+        domain = duplicate_axes(domain, [old], [new])
+    return kernel.copy(instructions=new_insns, domain=domain)
+    # }}}
+# }}}
 # {{{ rewrite reduction to imperative form
 def realize_reduction(kernel):
@@ -466,6 +546,7 @@ def adjust_local_temp_var_storage(kernel):
 def preprocess_kernel(kernel):
+    kernel = make_reduction_variables_unique(kernel)
    kernel = realize_reduction(kernel)
    # {{{ check that all CSEs have been realized

--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -514,7 +514,7 @@ class IndexVariableFinder(CombineMapper):
        result = self.rec(expr.expr)
        if not (set(expr.inames) & result):
            raise RuntimeError("reduction '%s' does not depend on "
-                    "reduction inames" % expr)
+                    "reduction inames (%s)" % (expr, ",".join(expr.inames)))
        if self.include_reduction_inames:
            return result
        else:

--- a/test/test_sem.py
+++ b/test/test_sem.py
@@ -4,7 +4,6 @@ import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
 import pyopencl.array as cl_array
-import pyopencl.clrandom as cl_random
 import loopy as lp
 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -287,16 +286,16 @@ def test_sem_3d(ctx_factory):
    # K - run-time symbolic
    n = 8
    knl = lp.make_kernel(ctx.devices[0],
-            "[K] -> {[i,j,k,e,m,mp]: 0<=i,j,k,m<%d and 0<=e<K}" % n,
+            "[K] -> {[i,j,k,e,m]: 0<=i,j,k,m<%d and 0<=e<K}" % n,
            [
                "[|i,j,k] <float32> ur[i,j,k] = sum_float32(m, D[i,m]*u[m,j,k,e])",
                "[|i,j,k] <float32> us[i,j,k] = sum_float32(m, D[j,m]*u[i,m,k,e])",
                "[|i,j,k] <float32> ut[i,j,k] = sum_float32(m, D[k,m]*u[i,j,m,e])",
                "lap[i,j,k,e]  = "
-                "  sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k,e] + G[1,m,j,k,e]*us[m,j,k,e] + G[2,m,j,k,e]*ut[m,j,k,e]))"
+                "  sum_float32(m, D[m,i]*(G[0,m,j,k,e]*ur[m,j,k] + G[1,m,j,k,e]*us[m,j,k] + G[2,m,j,k,e]*ut[m,j,k]))"
-                "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k,e] + G[3,i,m,k,e]*us[i,m,k,e] + G[4,i,m,k,e]*ut[i,m,k,e]))"
+                "+ sum_float32(m, D[m,j]*(G[1,i,m,k,e]*ur[i,m,k] + G[3,i,m,k,e]*us[i,m,k] + G[4,i,m,k,e]*ut[i,m,k]))"
-                "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m,e] + G[4,i,j,m,e]*us[i,j,m,e] + G[5,i,j,m,e]*ut[i,j,m,e]))"
+                "+ sum_float32(m, D[m,k]*(G[2,i,j,m,e]*ur[i,j,m] + G[4,i,j,m,e]*us[i,j,m] + G[5,i,j,m,e]*ut[i,j,m]))"
                ],
            [
            lp.ArrayArg("u",   dtype, shape=field_shape, order=order),
@@ -307,17 +306,18 @@ def test_sem_3d(ctx_factory):
            ],
            name="semlap", assumptions="K>=1")
-    print knl
+    #print knl
    #for tv in knl.temporary_variables.iteritems():
        #print tv
-    1/0
+    #1/0
    knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp")
    knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1"))
    #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"])
-    knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"])
+    #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"])
-    print knl
+    knl = lp.preprocess_kernel(knl)
+    #print knl
    #1/0
    kernel_gen = lp.generate_loop_schedules(knl)
@@ -343,10 +343,6 @@ def test_sem_3d(ctx_factory):
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the
-    # tests.
-    import pyopencl as cl
    import sys
    if len(sys.argv) > 1:
        exec(sys.argv[1])