From 7955a50d3730e5151572de0763db4a50aaf5889c Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 13 Aug 2012 00:50:18 -0400
Subject: [PATCH] More work on multi-domain.

---
 MEMO                     |  7 ++--
 loopy/check.py           | 25 +++++++++++-
 loopy/codegen/bounds.py  | 10 +++--
 loopy/codegen/control.py | 25 ++++++++----
 loopy/codegen/loop.py    |  4 +-
 loopy/kernel.py          | 51 ++++++++++++++++++++----
 test/test_loopy.py       | 83 ++++++++++++++++++++++++++++++++++++++--
 7 files changed, 175 insertions(+), 30 deletions(-)

diff --git a/MEMO b/MEMO
index b03133efe..cd91b7754 100644
--- a/MEMO
+++ b/MEMO
@@ -47,9 +47,6 @@ To-do
   increase sched. scalability
 
 - Multi-domain
-  - Incorporate loop-bound-mediated iname dependencies into domain
-    parenthood.
-
   - Reenable codegen sanity check.
 
 - Kernel splitting (via what variables get computed in a kernel)
@@ -65,7 +62,6 @@ To-do
 
 - Scalar insn priority
 
-
 - If finding a maximum proves troublesome, move parameters into the domain
 
 - : (as in, Matlab full-slice) in prefetches
@@ -126,6 +122,9 @@ Dealt with
   -> dealt with by type contexts
 
 - relating to Multi-Domain
+  - Incorporate loop-bound-mediated iname dependencies into domain
+    parenthood. [DONE]
+
   - Make sure that variables that enter into loop bounds are only written
     exactly once. [DONE]
 
diff --git a/loopy/check.py b/loopy/check.py
index 148595636..17df483b4 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -1,4 +1,5 @@
 from __future__ import division
+from islpy import dim_type
 
 
 
@@ -85,7 +86,7 @@ def check_for_inactive_iname_access(kernel):
 
 def check_for_write_races(kernel):
     from loopy.symbolic import DependencyMapper
-    from loopy.kernel import ParallelTag, GroupIndexTag, IlpBaseTag
+    from loopy.kernel import ParallelTag, GroupIndexTag
     depmap = DependencyMapper()
 
     for insn in kernel.instructions:
@@ -173,6 +174,26 @@ def check_for_orphaned_user_hardware_axes(kernel):
             raise RuntimeError("user-requested local hardware axis %d "
                     "has no iname mapped to it" % axis)
 
+def check_for_data_dependent_parallel_bounds(kernel):
+    from loopy.kernel import ParallelTag
+    # Raise RuntimeError if a domain with parallel inames has a temporary as parameter.
+    for i, dom in enumerate(kernel.domains):  # i is only used in the error message
+        dom_inames = set(dom.get_var_names(dim_type.set))  # names of the set dims
+        par_inames = set(iname
+                for iname in dom_inames
+                if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+        # Only domains containing at least one ParallelTag-tagged iname are checked.
+        if not par_inames:
+            continue
+        # A domain parameter that names a kernel temporary counts as data-dependent.
+        parameters = set(dom.get_var_names(dim_type.param))
+        for par in parameters:
+            if par in kernel.temporary_variables:
+                raise RuntimeError("Domain number %d has a data-dependent "
+                        "parameter '%s' and contains parallel "
+                        "inames '%s'. This is not allowed (for now)."
+                        % (i, par, ", ".join(par_inames)))
+
 # }}}
 
 def run_automatic_checks(kernel):
@@ -181,7 +202,7 @@ def run_automatic_checks(kernel):
     check_for_unused_hw_axes_in_insns(kernel)
     check_for_inactive_iname_access(kernel)
     check_for_write_races(kernel)
-
+    check_for_data_dependent_parallel_bounds(kernel)
 
 # {{{ sanity-check for implemented domains of each instruction
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 1ec320bef..c9a9b8660 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -104,7 +104,7 @@ def constraint_to_code(ccm, cns):
         comp_op = ">="
 
     from loopy.symbolic import constraint_to_expr
-    return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op)
+    return "%s %s 0" % (ccm(constraint_to_expr(cns), 'i'), comp_op)
 
 def filter_necessary_constraints(implemented_domain, constraints):
     return [cns
@@ -131,8 +131,10 @@ def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt):
             domain, check_inames,
             implemented_domain)
 
-    new_implemented_domain = implemented_domain & (
-            isl.Set.universe(domain.get_space()).add_constraints(bounds_checks))
+    bounds_check_set = isl.Set.universe(domain.get_space()).add_constraints(bounds_checks)
+    bounds_check_set, new_implemented_domain = isl.align_two(
+            bounds_check_set, implemented_domain)
+    new_implemented_domain = new_implemented_domain & bounds_check_set
 
     condition_codelets = [
             constraint_to_code(ccm, cns) for cns in
@@ -190,7 +192,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt):
         from cgen import Initializer, POD, Const, Line
         return gen_code_block([
             Initializer(Const(POD(np.int32, iname)),
-                ccm(equality_expr)),
+                ccm(equality_expr, 'i')),
             Line(),
             stmt,
             ])
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 1bd89ed69..79c86a181 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -120,7 +120,9 @@ def build_loop_nest(kernel, sched_index, codegen_state):
     from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
             gather_schedule_subloop)
 
-    # {{{ pass 1: pre-scan schedule for my schedule items' indices
+    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
+
+    # i.e. go up to the next LeaveLoop, and skip over inner loops.
 
     my_sched_indices = []
 
@@ -146,7 +148,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     # }}}
 
-    # {{{ pass 2: find admissible conditional inames for each schedule item
+    # {{{ pass 2: find admissible conditional inames for each sibling schedule item
 
     admissible_cond_inames = [
             get_admissible_conditional_inames_for(kernel, sched_index)
@@ -232,14 +234,21 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         # pick largest such group
         group_length, bounds_checks = max(found_hoists)
 
-        if bounds_checks:
-            check_set = isl.BasicSet.universe(kernel.space)
-            for cns in bounds_checks:
-                check_set = check_set.add_constraint(cns)
+        check_set = None
+        for cns in bounds_checks:
+            cns_set = (isl.BasicSet.universe(cns.get_space())
+                    .add_constraint(cns))
 
-            new_codegen_state = codegen_state.intersect(check_set)
-        else:
+            if check_set is None:
+                check_set = cns_set
+            else:
+                check_set, cns_set = isl.align_two(check_set, cns_set)
+                check_set = check_set.intersect(cns_set)
+
+        if check_set is None:
             new_codegen_state = codegen_state
+        else:
+            new_codegen_state = codegen_state.intersect(check_set)
 
         if group_length == 1:
             # group only contains starting schedule item
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index d37a720ca..36a24d5c5 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -223,9 +223,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=
         if len(slabs) == 1:
             cmt = None
 
-        new_kernel = kernel.copy(domain=domain & slab)
+        new_codegen_state = codegen_state.intersect(slab)
         inner = set_up_hw_parallel_loops(
-                new_kernel, sched_index, codegen_state, hw_inames_left)
+                kernel, sched_index, new_codegen_state, hw_inames_left)
         result.append(add_comment(cmt, inner))
 
     from loopy.codegen import gen_code_block
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 36575b103..0c9714001 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -848,8 +848,14 @@ class LoopKernel(Record):
         # {{{ process assumptions
 
         if assumptions is None:
-            assumptions_space = domains[0].get_space()
+            dom0_space = domains[0].get_space()
+            assumptions_space = isl.Space.params_alloc(
+                    dom0_space.get_ctx(), dom0_space.dim(dim_type.param))
+            for i in xrange(dom0_space.dim(dim_type.param)):
+                assumptions_space = assumptions_space.set_dim_name(
+                        dim_type.param, i, dom0_space.get_dim_name(dim_type.param, i))
             assumptions = isl.Set.universe(assumptions_space)
+
         elif isinstance(assumptions, str):
             all_inames = set()
             all_params = set()
@@ -865,6 +871,8 @@ class LoopKernel(Record):
             assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(),
                     assumptions_set_str)
 
+        assert assumptions.is_params()
+
         # }}}
 
         Record.__init__(self,
@@ -955,6 +963,8 @@ class LoopKernel(Record):
         iname_set_stack = []
         result = []
 
+        writer_map = self.writer_map()
+
         for dom in self.domains:
             parameters = set(dom.get_var_names(dim_type.param))
             inames = set(dom.get_var_names(dim_type.set))
@@ -966,12 +976,38 @@ class LoopKernel(Record):
 
             discard_level_count = 0
             while discard_level_count < len(iname_set_stack):
-                last_inames = iname_set_stack[-1-discard_level_count]
+                # {{{ check for parenthood by loop bound iname
 
+                last_inames = iname_set_stack[-1-discard_level_count]
                 if last_inames & parameters:
                     break
-                else:
-                    discard_level_count += 1
+
+                # }}}
+
+                # {{{ check for parenthood by written variable
+
+                is_parent_by_variable = False
+                for par in parameters:
+                    if par in self.temporary_variables:
+                        writer_insns = writer_map[par]
+
+                        if len(writer_insns) > 1:
+                            raise RuntimeError("loop bound '%s' "
+                                    "may only be written to once" % par)
+
+                        writer_insn, = writer_insns
+                        writer_inames = self.insn_inames(writer_insn)
+
+                        if writer_inames & last_inames:
+                            is_parent_by_variable = True
+                            break
+
+                if is_parent_by_variable:
+                    break
+
+                # }}}
+
+                discard_level_count += 1
 
             if discard_level_count:
                 iname_set_stack = iname_set_stack[:-discard_level_count]
@@ -1234,9 +1270,10 @@ class LoopKernel(Record):
         domain = self.get_inames_domain(frozenset([iname]))
         d_var_dict = domain.get_var_dict()
 
-        dom_intersect_assumptions = (
-                isl.align_spaces(self.assumptions, domain, obj_bigger_ok=True)
+        dom_intersect_assumptions = (isl.align_spaces(
+                self.assumptions, domain, obj_bigger_ok=True)
                 & domain)
+
         lower_bound_pw_aff = (
                 self.cache_manager.dim_min(
                     dom_intersect_assumptions,
@@ -1252,7 +1289,7 @@ class LoopKernel(Record):
             pass
 
         size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
-        size = size.intersect_domain(self.assumptions)
+        size = size.gist(self.assumptions)
 
         return BoundsRecord(
                 lower_bound_pw_aff=lower_bound_pw_aff,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 37d07f04e..17ef72f9a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -399,6 +399,11 @@ def test_dependent_loop_bounds_2(ctx_factory):
 
 
 def test_dependent_loop_bounds_3(ctx_factory):
+    # The point of this test is that it shows a dependency between
+    # domains that is exclusively mediated by the row_len temporary.
+    # It also makes sure that row_len gets read before any
+    # conditionals use it.
+
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
 
@@ -409,7 +414,7 @@ def test_dependent_loop_bounds_3(ctx_factory):
                 ],
             [
                 "<> row_len = a_row_lengths[i]",
-                "a[i,j] = 1",
+                "a[i,jj] = 1",
                 ],
             [
                 lp.GlobalArg("a_row_lengths", np.int32),
@@ -417,15 +422,87 @@ def test_dependent_loop_bounds_3(ctx_factory):
                 lp.ScalarArg("n", np.int32),
                 ])
 
+    assert knl.parents_per_domain()[1] == 0
+
     knl = lp.split_dimension(knl, "i", 128, outer_tag="g.0",
             inner_tag="l.0")
-    knl = lp.split_dimension(knl, "j", 128, outer_tag="g.1",
-            inner_tag="l.1")
+
     cknl = lp.CompiledKernel(ctx, knl)
     print "---------------------------------------------------"
     cknl.print_code()
     print "---------------------------------------------------"
 
+    knl_bad = lp.split_dimension(knl, "jj", 128, outer_tag="g.1",
+            inner_tag="l.1")
+
+    import pytest
+    with pytest.raises(RuntimeError):
+        list(lp.generate_loop_schedules(knl_bad))
+
+
+
+
+def test_independent_multi_domains(ctx_factory):  # kernel built from two domains
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    # i and j live in separate domains; the only name both mention is the bound n.
+    knl = lp.make_kernel(ctx.devices[0],
+            [
+                "{[i]: 0<=i<n}",
+                "{[j]: 0<=j<n}",
+                ],
+            [
+                "a[i,j] = 1",
+                ],
+            [
+                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
+                lp.ScalarArg("n", np.int32),
+                ])
+
+    # Split both inames by 16, tagging the pieces g.0/l.0 (for i) and g.1/l.1 (for j).
+    knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0",
+            inner_tag="l.0")
+    knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1",
+            inner_tag="l.1")
+    assert knl.parents_per_domain() == 2*[None]  # a None entry for each domain
+    # Run with n=50; out_host=True presumably yields host-side arrays -- TODO confirm.
+    n = 50
+    cknl = lp.CompiledKernel(ctx, knl)
+    evt, (a,) = cknl(queue, n=n, out_host=True)
+    # The result must be 50x50 with every entry equal to 1.
+    assert a.shape == (50, 50)
+    assert (a == 1).all()
+
+
+
+
+
+def test_bare_data_dependency(ctx_factory):  # loop bound supplied by an instruction
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    # The bound znirp is a domain parameter that the first instruction assigns from n.
+    knl = lp.make_kernel(ctx.devices[0],
+            [
+                "[znirp] -> {[i]: 0<=i<znirp}",
+                ],
+            [
+                "<> znirp = n",
+                "a[i] = 1",
+                ],
+            [
+                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+                lp.ScalarArg("n", np.int32),
+                ])
+    # Run with n=20000; out_host=True presumably yields host-side arrays -- TODO confirm.
+    cknl = lp.CompiledKernel(ctx, knl)
+    n = 20000
+    evt, (a,) = cknl(queue, n=n, out_host=True)
+    # The result must have n entries, all equal to 1.
+    assert a.shape == (n,)
+    assert (a == 1).all()
+
 
 
 
-- 
GitLab