From e606dd4639711a0e712fb7c247e28c563cf6e5b1 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 12 Oct 2011 18:31:52 -0400
Subject: [PATCH] First cut of conditional hoisting.

---
 MEMO                         |  27 ++++--
 loopy/codegen/__init__.py    |  46 +++++++--
 loopy/codegen/bounds.py      |  12 +--
 loopy/codegen/dispatch.py    | 182 +++++++++++++++++++++++++++++------
 loopy/codegen/instruction.py |  44 ++-------
 loopy/codegen/loop.py        | 132 ++++++++++++-------------
 loopy/isl.py                 |  20 +---
 loopy/kernel.py              |   6 +-
 loopy/schedule.py            |  45 +++++++--
 test/test_matmul.py          |   2 +-
 10 files changed, 333 insertions(+), 183 deletions(-)

diff --git a/MEMO b/MEMO
index 8eb2fd5ee..24ddf973c 100644
--- a/MEMO
+++ b/MEMO
@@ -49,6 +49,13 @@ Things to consider
 - Loop bounds currently may not depend on parallel dimensions
   Does it make sense to relax this?
 
+- Why do CSEs necessarily have to duplicate the inames?
+  -> because that would be necessary for a sequential prefetch
+
+- Cannot do slab decomposition on inames that share a tag with
+  other inames
+  -> Is that reasonable?
+
 TODO
 ^^^^
 
@@ -69,10 +76,6 @@ TODO
   -> Reduction
   -> CSEs?
 
-- Slab decomposition for parallel dimensions
-  - implement at the outermost nesting level regardless
-  - bound *all* tagged inames
-
 - Sharing of checks across ILP instances
 
 - Some things involving CSEs might be impossible to schedule
@@ -80,12 +83,24 @@ TODO
 
 - Flag, exploit idempotence
 
-- Implement insert_parallel_dim_check_points
-  (but first: find a kernel that needs it)
+- How should we implement the dim shuffling for odd-size prefetches?
+
+- Slab decomposition for ILP
+
+- Better for-loop bound generation
+  -> Try a triangular loop
+
+- Implement conditional hoisting
+  (needed, e.g., by slab decomposition)
 
 Dealt with
 ^^^^^^^^^^
 
+- Slab decomposition for parallel dimensions
+  - implement at the outermost nesting level regardless
+  - bound *all* tagged inames
+  - can't slab inames that share tags with other inames (for now)
+
 - Make syntax for iname dependencies
 
 - make syntax for insn dependencies
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index b15178dc0..f022e6098 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -127,6 +127,39 @@ class CodeGenerationState(object):
 
 # }}}
 
+# {{{ initial assignments
+
+def make_initial_assignments(kernel):
+    assignments = {}
+
+    global_size, local_size = kernel.get_grid_sizes()
+
+    from loopy.kernel import TAG_LOCAL_IDX, TAG_GROUP_IDX
+    from pymbolic import var
+
+    for iname in kernel.all_inames():
+        tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(tag, TAG_LOCAL_IDX):
+            hw_axis_expr = var("lid")(tag.axis)
+            hw_axis_size = local_size[tag.axis]
+
+        elif isinstance(tag, TAG_GROUP_IDX):
+            hw_axis_expr = var("gid")(tag.axis)
+            hw_axis_size = global_size[tag.axis]
+
+        else:
+            continue
+
+        bounds = kernel.get_iname_bounds(iname)
+
+        from loopy.symbolic import pw_aff_to_expr
+        assignments[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff) + hw_axis_expr
+
+    return assignments
+
+# }}}
+
 # {{{ main code generation entrypoint
 
 def generate_code(kernel):
@@ -138,9 +171,8 @@ def generate_code(kernel):
             CLLocal, CLImage, CLConstant)
 
     from loopy.symbolic import LoopyCCodeMapper
-    ccm = LoopyCCodeMapper(kernel)
-
-    # {{{ build top-level
+    ccm = LoopyCCodeMapper(kernel).copy_and_assign_many(
+            make_initial_assignments(kernel))
 
     mod = Module()
 
@@ -244,12 +276,12 @@ def generate_code(kernel):
 
     # }}}
 
-    from loopy.codegen.dispatch import build_loop_nest
-
     from islpy import align_spaces
     initial_implemented_domain = align_spaces(kernel.assumptions, kernel.domain)
-    gen_code = build_loop_nest(kernel, 0,
-            CodeGenerationState(initial_implemented_domain, c_code_mapper=ccm))
+    codegen_state = CodeGenerationState(initial_implemented_domain, c_code_mapper=ccm)
+
+    from loopy.codegen.loop import set_up_hw_parallel_loops
+    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)
 
     body.append(Line())
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 84f130e2d..a7a5cedee 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -143,23 +143,23 @@ def filter_necessary_constraints(implemented_domain, constraints):
         if not implemented_domain.is_subset(
             isl.Set.universe(space).add_constraint(cns))]
 
-def generate_bounds_checks(domain, check_vars, implemented_domain):
+def generate_bounds_checks(domain, check_inames, implemented_domain):
     domain_bset, = (domain
-            .eliminate_except(check_vars, [dim_type.set])
+            .eliminate_except(check_inames, [dim_type.set])
             .coalesce()
             .get_basic_sets())
 
     return filter_necessary_constraints(
             implemented_domain, domain_bset.get_constraints())
 
-def generate_bounds_checks_code(ccm, domain, check_vars, implemented_domain):
+def generate_bounds_checks_code(ccm, domain, check_inames, implemented_domain):
     return [constraint_to_code(ccm, cns) for cns in 
-            generate_bounds_checks(domain, check_vars, implemented_domain)]
+            generate_bounds_checks(domain, check_inames, implemented_domain)]
 
-def wrap_in_bounds_checks(ccm, domain, check_vars, implemented_domain, stmt):
+def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt):
     from loopy.codegen import wrap_in_if
     return wrap_in_if(
-            generate_bounds_checks_code(ccm, domain, check_vars,
+            generate_bounds_checks_code(ccm, domain, check_inames,
                 implemented_domain),
             stmt)
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
index 77b2f718e..9d1f0261a 100644
--- a/loopy/codegen/dispatch.py
+++ b/loopy/codegen/dispatch.py
@@ -2,6 +2,69 @@
 from __future__ import division
 
 from loopy.codegen import CodeGenerationState, gen_code_block
+import islpy as isl
+
+
+
+
+def get_admissible_conditional_inames_for(kernel, sched_index):
+    """This function disallows conditionals on local-idx tagged
+    inames if there is a barrier nested somewhere within.
+    """
+
+    from loopy.kernel import TAG_LOCAL_IDX, ParallelTag
+
+    from loopy.schedule import find_active_inames_at, has_barrier_within
+    result = find_active_inames_at(kernel, sched_index)
+
+    has_barrier = has_barrier_within(kernel, sched_index)
+
+    for iname, tag in kernel.iname_to_tag.iteritems():
+        if isinstance(tag, ParallelTag):
+            if not has_barrier or not isinstance(tag, TAG_LOCAL_IDX):
+                result.add(iname)
+
+    return result
+
+
+
+
+def generate_code_for_sched_index(kernel, sched_index, codegen_state):
+    from loopy.schedule import (EnterLoop, RunInstruction, Barrier)
+
+    sched_item = kernel.schedule[sched_index]
+
+    if isinstance(sched_item, EnterLoop):
+        tag = kernel.iname_to_tag[sched_item.iname]
+
+        from loopy.codegen.loop import (
+                generate_unroll_loop,
+                generate_sequential_loop_dim_code)
+
+        from loopy.kernel import TAG_UNROLL, SequentialTag
+        if isinstance(tag, TAG_UNROLL):
+            func = generate_unroll_loop
+        elif tag is None or isinstance(tag, SequentialTag):
+            func = generate_sequential_loop_dim_code
+        else:
+            raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'"
+                    % (sched_item.iname, tag))
+
+        return func(kernel, sched_index, codegen_state)
+
+    elif isinstance(sched_item, Barrier):
+        from cgen import Statement as S
+        return S("barrier(CLK_LOCAL_MEM_FENCE)")
+
+    elif isinstance(sched_item, RunInstruction):
+        insn = kernel.id_to_insn[sched_item.insn_id]
+
+        from loopy.codegen.instruction import generate_instruction_code
+        return generate_instruction_code(kernel, insn, codegen_state)
+
+    else:
+        raise RuntimeError("unexpected schedule item type: %s"
+                % type(sched_item))
 
 
 
@@ -11,9 +74,10 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
             gather_schedule_subloop)
-    from cgen import Statement as S
 
-    result = []
+    # {{{ pass 1: pre-scan schedule for my schedule items' indices
+
+    my_sched_indices = []
 
     while sched_index < len(kernel.schedule):
         sched_item = kernel.schedule[sched_index]
@@ -21,49 +85,105 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         if isinstance(sched_item, LeaveLoop):
             break
 
-        elif isinstance(sched_item, EnterLoop):
-            tag = kernel.iname_to_tag[sched_item.iname]
-
-            from loopy.codegen.loop import (
-                    generate_unroll_or_ilp_code,
-                    generate_parallel_loop_dim_code,
-                    generate_sequential_loop_dim_code)
-
-            from loopy.kernel import (TAG_UNROLL, TAG_ILP,
-                    ParallelTagWithAxis)
-            if isinstance(tag, (TAG_UNROLL, TAG_ILP)):
-                func = generate_unroll_or_ilp_code
-            elif isinstance(tag, ParallelTagWithAxis):
-                func = generate_parallel_loop_dim_code
-            else:
-                func = generate_sequential_loop_dim_code
-
-            result.append(func(kernel, sched_index, codegen_state))
+        my_sched_indices.append(sched_index)
 
+        if isinstance(sched_item, EnterLoop):
             _, sched_index = gather_schedule_subloop(
                     kernel.schedule, sched_index)
-
         elif isinstance(sched_item, Barrier):
-            result.append(S("barrier(CLK_LOCAL_MEM_FENCE)"))
-
             sched_index += 1
 
         elif isinstance(sched_item, RunInstruction):
-            insn = kernel.id_to_insn[sched_item.insn_id]
+            sched_index += 1
+        else:
+            raise RuntimeError("unexpected schedule item type: %s"
+                    % type(sched_item))
 
-            from loopy.codegen.instruction import generate_instruction_code
+    # }}}
 
-            result.append(
-                    generate_instruction_code(kernel, insn, codegen_state))
+    # {{{ pass 2: find admissible conditional inames
 
-            sched_index += 1
+    # FIXME: See if another inner insn relies on a different iname
+    # boiling down to the same tag. If so, exclude that.
+
+    admissible_cond_inames = [
+            get_admissible_conditional_inames_for(kernel, sched_index)
+            for sched_index in my_sched_indices]
+
+    # }}}
+
+    # {{{ pass 3: greedily group instructions that share admissible conditionals
+
+    def build_insn_group(sched_indices_and_cond_inames, codegen_state,
+            min_iname_count=1):
+        # min_iname_count serves to prevent infinite recursion by imposing a
+        # bigger and bigger minimum size on the group of shared inames found.
+
+        if not sched_indices_and_cond_inames:
+            return []
+
+        sched_index, cond_inames = sched_indices_and_cond_inames[0]
+
+        # {{{ keep growing instruction group as long as shared inames exist
+
+        current_iname_set = cond_inames
 
+        idx = 1
+        while (len(current_iname_set) >= min_iname_count
+                and idx < len(sched_indices_and_cond_inames)):
+            other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx]
+            new_iname_set = current_iname_set & other_cond_inames
+
+            if len(new_iname_set) >= min_iname_count:
+                idx += 1
+                current_iname_set = new_iname_set
+            else:
+                break
+
+        # }}}
+
+        if len(current_iname_set) >= min_iname_count:
+            # Success: found a big enough group of inames for a conditional.
+            # See if there are bounds checks available for that set.
+
+            from loopy.codegen.bounds import generate_bounds_checks
+            bounds_checks = generate_bounds_checks(kernel.domain, current_iname_set,
+                    codegen_state.implemented_domain)
         else:
-            raise RuntimeError("unexpected schedule item type: %s"
-                    % type(sched_item))
+            bounds_checks = []
+
+        if bounds_checks:
+            check_set = isl.BasicSet.universe(kernel.space)
+            for cns in bounds_checks:
+                check_set = check_set.add_constraint(cns)
+
+            new_codegen_state = codegen_state.intersect(check_set)
+        else:
+            new_codegen_state = codegen_state
+
+        if idx == 1:
+            # group only contains starting schedule item
+            result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)]
+        else:
+            # recurse with a bigger iname count
+            result = build_insn_group(sched_indices_and_cond_inames[0:idx],
+                    new_codegen_state, len(current_iname_set)+1)
+
+        if bounds_checks:
+            from loopy.codegen import wrap_in_if
+            from loopy.codegen.bounds import constraint_to_code
+            result = [wrap_in_if(
+                    [constraint_to_code(codegen_state.c_code_mapper, cns) for cns in bounds_checks],
+                    gen_code_block(result))]
+
+        return result + build_insn_group(
+                sched_indices_and_cond_inames[idx:], codegen_state)
 
+    # }}}
 
-    return gen_code_block(result)
+    return gen_code_block(
+            build_insn_group(zip(
+                my_sched_indices, admissible_cond_inames), codegen_state))
 
 
 
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e9ddcf2c0..1d1bc55d5 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -41,45 +41,11 @@ class ILPInstance(Record):
 
 
 def generate_ilp_instances(kernel, insn, codegen_state):
-    assignments = {}
     impl_domain = codegen_state.implemented_domain
 
-    from loopy.kernel import (TAG_ILP,
-            TAG_LOCAL_IDX, TAG_GROUP_IDX)
+    from loopy.kernel import TAG_ILP
 
-    from pymbolic import var
-
-    # {{{ pass 1: assign all hw-parallel dimensions
-
-    global_size, local_size = kernel.get_grid_sizes()
-
-    for iname in insn.all_inames():
-        tag = kernel.iname_to_tag.get(iname)
-
-        if isinstance(tag, TAG_LOCAL_IDX):
-            hw_axis_expr = var("lid")(tag.axis)
-            hw_axis_size = local_size[tag.axis]
-
-        elif isinstance(tag, TAG_GROUP_IDX):
-            hw_axis_expr = var("gid")(tag.axis)
-            hw_axis_size = global_size[tag.axis]
-
-        else:
-            continue
-
-        bounds = kernel.get_iname_bounds(iname)
-
-        from loopy.isl import make_slab
-        slab = make_slab(impl_domain.get_space(), iname,
-                bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size)
-        impl_domain = impl_domain.intersect(slab)
-
-        from loopy.symbolic import pw_aff_to_expr
-        assignments[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff) + hw_axis_expr
-
-    # }}} 
-
-    result = [ILPInstance(impl_domain, assignments, frozenset())]
+    result = [ILPInstance(impl_domain, {}, frozenset())]
 
     # {{{ pass 2: treat all ILP dimensions
 
@@ -118,3 +84,9 @@ def generate_instruction_code(kernel, insn, codegen_state):
 
     from loopy.codegen import gen_code_block
     return gen_code_block(result)
+
+
+
+
+
+# vim: foldmethod=marker
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index e667a9d76..20405c0ff 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -12,7 +12,6 @@ from loopy.codegen.dispatch import build_loop_nest
 
 
 def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
-    from loopy.isl import cast_constraint_to_space
     from loopy.codegen.bounds import get_bounds_constraints, get_defined_inames
     lower_constraints_orig, upper_constraints_orig, equality_constraints_orig = \
             get_bounds_constraints(kernel.domain, iname,
@@ -32,16 +31,15 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
 
 # {{{ conditional-minimizing slab decomposition
 
-def get_slab_decomposition(kernel, sched_index, exec_domain):
+def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
     from loopy.isl import block_shift_constraint, negate_constraint
 
-    ccm = exec_domain.c_code_mapper
+    ccm = codegen_state.c_code_mapper
     space = kernel.space
-    iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
     lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname,
-            exec_domain.implemented_domain)
+            codegen_state.implemented_domain)
 
     lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0))
 
@@ -83,11 +81,10 @@ def get_slab_decomposition(kernel, sched_index, exec_domain):
 
 # {{{ unrolled/ILP loops
 
-def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
+def generate_unroll_loop(kernel, sched_index, codegen_state):
     from loopy.isl import block_shift_constraint
-    from loopy.codegen.bounds import solve_constraint_for_bound
 
-    from cgen import (POD, Assign, Line, Statement as S, Initializer, Const)
+    from cgen import (POD, Line)
 
     ccm = codegen_state.c_code_mapper
     space = kernel.space
@@ -97,12 +94,6 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
     lower_cns, upper_cns = get_simple_loop_bounds(kernel, sched_index, iname,
             codegen_state.implemented_domain)
 
-    lower_kind, lower_bound = solve_constraint_for_bound(lower_cns, iname)
-    upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname)
-
-    assert lower_kind == ">="
-    assert upper_kind == "<"
-
     bounds = kernel.get_iname_bounds(iname)
     from loopy.isl import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
@@ -111,7 +102,8 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
     lower_bound_pw_aff_pieces = bounds.lower_bound_pw_aff.coalesce().get_pieces()
 
     if len(lower_bound_pw_aff_pieces) > 1:
-        raise NotImplementedError("lower bound for ILP/unroll needed conditional")
+        raise NotImplementedError("lower bound for unroll needs conditional/"
+                "has more than one piece")
 
     (_, lower_bound_aff), = lower_bound_pw_aff_pieces
 
@@ -122,7 +114,7 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
                             block_shift_constraint(
                                 lower_cns, iname, -i, as_equality=True)))
 
-    from loopy.kernel import TAG_ILP, TAG_UNROLL
+    from loopy.kernel import TAG_UNROLL
     if isinstance(tag, TAG_UNROLL):
         result = [POD(np.int32, iname), Line()]
 
@@ -134,23 +126,6 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
 
         return gen_code_block(result)
 
-    elif isinstance(tag, TAG_ILP):
-        new_ilp_instances = []
-        for ilpi in codegen_state.ilp_instances:
-            for i in range(length):
-                idx_aff = lower_bound_aff + i
-                new_ilp_instances.append(ilpi.fix(iname, idx_aff))
-
-        overall_slab = (isl.Set.universe(kernel.space)
-                .add_constraint(lower_cns)
-                .add_constraint(upper_cns))
-
-        return build_loop_nest(kernel, sched_index+1,
-                CodeGenerationState(
-                    codegen_state.implemented_domain.intersect(overall_slab),
-                    codegen_state.c_code_mapper,
-                    new_ilp_instances))
-
     else:
         raise RuntimeError("unexpected tag")
 
@@ -158,36 +133,61 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
 
 # {{{ parallel loop
 
-def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain):
-    from loopy.isl import make_slab
+def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
+    from loopy.kernel import UniqueTag, HardwareParallelTag, TAG_LOCAL_IDX, TAG_GROUP_IDX
 
-    ccm = exec_domain.c_code_mapper
-    space = kernel.space
-    iname = kernel.schedule[sched_index].iname
+    if hw_inames_left is None:
+        hw_inames_left = [iname
+                for iname in kernel.all_inames()
+                if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
+
+    from loopy.codegen.dispatch import build_loop_nest
+    if not hw_inames_left:
+        return build_loop_nest(kernel, sched_index, codegen_state)
+
+    global_size, local_size = kernel.get_grid_sizes()
+
+    iname = hw_inames_left.pop()
     tag = kernel.iname_to_tag.get(iname)
 
-    lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition(
-            kernel, sched_index, exec_domain)
+    assert isinstance(tag, UniqueTag)
 
-    # For a parallel loop dimension, the global loop bounds are
-    # automatically obeyed--simply because no work items are launched
-    # outside the requested grid.
-    #
-    # For a forced length, this is implemented by an if below.
+    other_inames_with_same_tag = [
+            other_iname for other_iname in kernel.all_inames()
+            if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
+            and kernel.iname_to_tag.get(other_iname).key == tag.key
+            and other_iname != iname]
 
-    if tag.forced_length is None:
-        exec_domain = exec_domain.intersect(
-                isl.Set.universe(kernel.space)
-                .add_constraint(lb_cns_orig)
-                .add_constraint(ub_cns_orig))
+    # {{{ 'implement' hardware axis boundaries
+
+    if isinstance(tag, TAG_LOCAL_IDX):
+        hw_axis_size = local_size[tag.axis]
+    elif isinstance(tag, TAG_GROUP_IDX):
+        hw_axis_size = global_size[tag.axis]
     else:
-        impl_len = tag.forced_length
-        start, _, _ = kernel.get_bounds(iname, (iname,), allow_parameters=True)
-        exec_domain = exec_domain.intersect(
-                make_slab(kernel.space, iname, start, start+impl_len))
+        raise RuntimeError("unknown hardware parallel tag")
+
+    result = []
+
+    bounds = kernel.get_iname_bounds(iname)
+
+    from loopy.isl import make_slab
+    slab = make_slab(kernel.space, iname,
+            bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size)
+    codegen_state = codegen_state.intersect(slab)
+
+    # }}}
+
+    lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition(
+            kernel, iname, sched_index, codegen_state)
+
+    if other_inames_with_same_tag and len(slabs) > 1:
+        raise RuntimeError("cannot do slab decomposition on inames that share "
+                "a tag with other inames")
+
+    ccm = codegen_state.c_code_mapper
 
     result = []
-    nums_of_conditionals = []
 
     from loopy.codegen import add_comment
 
@@ -196,11 +196,10 @@ def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain):
         if len(slabs) == 1:
             cmt = None
 
-        new_kernel = kernel.copy(
-                domain=kernel.domain.intersect(slab))
-        result.append(
-                add_comment(cmt,
-                    build_loop_nest(new_kernel, sched_index+1, exec_domain)))
+        new_kernel = kernel.copy(domain=kernel.domain.intersect(slab))
+        inner = set_up_hw_parallel_loops(
+                new_kernel, sched_index, codegen_state, hw_inames_left)
+        result.append(add_comment(cmt, inner))
 
     from loopy.codegen import gen_code_block
     return gen_code_block(result, is_alternatives=True)
@@ -209,31 +208,28 @@ def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain):
 
 # {{{ sequential loop
 
-def generate_sequential_loop_dim_code(kernel, sched_index, exec_domain):
-
-    ccm = exec_domain.c_code_mapper
+def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
+    ccm = codegen_state.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
     lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition(
-            kernel, sched_index, exec_domain)
+            kernel, iname, sched_index, codegen_state)
 
     result = []
-    nums_of_conditionals = []
 
     for slab_name, slab in slabs:
         cmt = "%s slab for '%s'" % (slab_name, iname)
         if len(slabs) == 1:
             cmt = None
 
-        new_exec_domain = exec_domain.intersect(slab)
+        new_codegen_state = codegen_state.intersect(slab)
         inner = build_loop_nest(kernel, sched_index+1,
-                new_exec_domain)
+                new_codegen_state)
 
         from loopy.codegen.bounds import wrap_in_for_from_constraints
 
-        # regular loop
         if cmt is not None:
             from cgen import Comment
             result.append(Comment(cmt))
diff --git a/loopy/isl.py b/loopy/isl.py
index ef1b6cf51..055b65655 100644
--- a/loopy/isl.py
+++ b/loopy/isl.py
@@ -9,21 +9,6 @@ from islpy import dim_type
 
 
 
-def cast_constraint_to_space(cns, new_space, as_equality=None):
-    1/0 # bad routine, shouldn't be used
-
-    if as_equality is None:
-        as_equality = cns.is_equality()
-
-    if as_equality:
-        factory = isl.Constraint.eq_from_names
-    else:
-        factory = isl.Constraint.ineq_from_names
-    return factory(new_space, cns.get_coefficients_by_name())
-
-
-
-
 def block_shift_constraint(cns, type, pos, multiple, as_equality=None):
     if as_equality != cns.is_equality():
         if as_equality:
@@ -107,10 +92,13 @@ def pw_aff_to_aff(pw_aff):
 
 
 
-def dump_local_space(ls):
+def dump_space(ls):
     return " ".join("%s: %d" % (dt, ls.dim(getattr(dim_type, dt))) 
             for dt in dim_type.names)
 
+
+
+
 def make_slab(space, iname, start, stop):
     zero = isl.Aff.zero_on_domain(space)
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 2a8256ec4..0da1d5671 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -35,7 +35,7 @@ class UniqueTag(IndexTag):
     def key(self):
         return type(self)
 
-class ParallelTagWithAxis(ParallelTag, UniqueTag):
+class HardwareParallelTag(ParallelTag, UniqueTag):
     __slots__ = ["axis"]
 
     def __init__(self, axis):
@@ -56,10 +56,10 @@ class ParallelTagWithAxis(ParallelTag, UniqueTag):
     
 
 
-class TAG_GROUP_IDX(ParallelTagWithAxis):
+class TAG_GROUP_IDX(HardwareParallelTag):
     print_name = "g"
 
-class TAG_LOCAL_IDX(ParallelTagWithAxis):
+class TAG_LOCAL_IDX(HardwareParallelTag):
     print_name = "l"
 
 class TAG_AUTO_LOCAL_IDX(ParallelTag):
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 5fef6bcfb..0dbb8af74 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -667,13 +667,6 @@ def insert_barriers(kernel, schedule, level=0):
 
 
 
-def insert_parallel_dim_check_points(kernel, schedule):
-    # FIXME: Unimplemented
-    return kernel
-
-
-
-
 def generate_loop_schedules(kernel):
     kernel = realize_reduction(kernel)
 
@@ -701,12 +694,46 @@ def generate_loop_schedules(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
         assert not owed_barriers
 
-        schedule = insert_parallel_dim_check_points(kernel, gen_sched)
-
         yield kernel.copy(schedule=gen_sched)
 
 
 
 
 
+# {{{ schedule utilities
+
+def find_active_inames_at(kernel, sched_index):
+    active_inames = []
+
+    from loopy.schedule import EnterLoop, LeaveLoop
+    for sched_item in kernel.schedule[:sched_index]:
+        if isinstance(sched_item, EnterLoop):
+            active_inames.append(sched_item.iname)
+        if isinstance(sched_item, LeaveLoop):
+            active_inames.pop()
+
+    return set(active_inames)
+
+
+
+
+def has_barrier_within(kernel, sched_index):
+    sched_item = kernel.schedule[sched_index]
+
+    if isinstance(sched_item, EnterLoop):
+        loop_contents, _ = gather_schedule_subloop(
+                kernel.schedule, sched_index)
+        from pytools import any
+        return any(isinstance(subsched_item, Barrier)
+                for subsched_item in loop_contents)
+    elif isinstance(sched_item, Barrier):
+        return True
+    else:
+        return False
+
+# }}}
+
+
+
+
 # vim: foldmethod=marker
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 6a54e5864..42ebf7e9e 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -216,7 +216,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
     knl = lp.split_dimension(knl, "i", 16,
             outer_tag="g.0", inner_tag="l.1", no_slabs=True)
     knl = lp.split_dimension(knl, "j", 16,
-            outer_tag="g.1", inner_tag="l.0", no_slabs=True)
+            outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_dimension(knl, "k", 16)
 
     knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"])
-- 
GitLab