From a86fd84e85faa9f4687685507ee01edd3673e686 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 24 Sep 2011 16:51:34 -0400
Subject: [PATCH] Towards insn-based loopy.

---
 MEMO                      |  83 +++++
 loopy/__init__.py         | 544 +++++++++++++++++++++++++++++--
 loopy/codegen/bounds.py   |  12 +-
 loopy/codegen/prefetch.py |   6 +-
 loopy/isl.py              |  65 +++-
 loopy/kernel.py           | 668 ++++++++++++++++++++------------------
 loopy/schedule.py         | 106 +++++-
 loopy/symbolic.py         | 283 +++++++++++++---
 loopy/tools.py            |  13 +
 test/test_isl.py          |  27 ++
 test/test_matmul.py       |  26 +-
 11 files changed, 1408 insertions(+), 425 deletions(-)
 create mode 100644 MEMO
 create mode 100644 loopy/tools.py
 create mode 100644 test/test_isl.py

diff --git a/MEMO b/MEMO
new file mode 100644
index 000000000..c08463c3f
--- /dev/null
+++ b/MEMO
@@ -0,0 +1,83 @@
+TODO list
+^^^^^^^^^
+
+Immediately:
+------------
+TODO: Imitate codegen bulk slab handling in bulk slab trials
+
+For writeup:
+------------
+TODO: Reimplement forced lengths
+TODO: Try, fix reg. prefetch (DG example) / CSEs
+      ILP and reg. prefetch interact!
+TODO: Custom reductions per red. axis
+TODO: Functions
+TODO: Common subexpressions
+TODO: Array common subexpressions (shared and private!)
+TODO: ILP arrays
+FIXME: support non-reductive dimensions (what did I mean here?)
+FIXME: write names should be assigned during scheduling
+FIXME: screwy lower bounds in ILP
+FIXME: Leading syncthreads elimination
+
+TODO: Divisibility
+TODO: Try, fix indirect addressing
+
+TODO: Implement GT200 matmul, Fermi matmul, DG
+TODO: DMA engine threads?
+TODO: Deal with equalities that crop up.
+TODO: Better user feedback.
+
+Later:
+------
+TODO: Try different kernels
+TODO: - Tricky: Convolution, Stencil
+TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
+TODO: implement efficient ceil_div? (as opposed to floor_div)
+TODO: why are corner cases inefficient?
+TODO: Use gists (why do disjoint sets arise?)
+TODO: variable shuffle detection
+
+Things to consider
+^^^^^^^^^^^^^^^^^^
+
+- implemented_domain may end up being smaller than requested in cse
+  evaluations--check that!
+
+- Instructions must agree on all iname tags except the parallel ones
+
+- Auto tag assignment depends on known work group size
+
+- Dependencies are pointwise for shared loop dimensions
+  and global over non-shared ones (between dependent and ancestor)
+
+- Parallel dimension splitting/merging via tags
+
+- Generalize reduction to be over multiple variables
+
+- Implement get_problems()
+
+
+Dealt with
+^^^^^^^^^^
+
+- Reduction needs to know a neutral element
+
+- Types of reduction variables?
+
+How to represent the schedule
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Focus everything on instructions
+  - Each instruction can have its own interpretation of global/local ids.
+- Loop variables/splits and such are and remain global
+- What about grouped dimensions?
+- UniqueTag is the wrong idea! (not really--it's ok per-insn)
+
+Scheduling:
+- Find insns whose dependencies are satisfied
+- Find maximally shareable loop
+- Open that one
+- For that opened loop, check if an available insn can run
+  - If not, open another loop
+  - Else, schedule that instruction
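+
+A rough sketch of that loop in Python (deps_satisfied and
+most_shareable_loop are hypothetical helpers; none of this is
+implemented yet):
+
+    def schedule_kernel(kernel):
+        schedule = []
+        open_loops = []
+        unscheduled = list(kernel.instructions)
+        while unscheduled:
+            # insns whose forced/auto dependencies are already scheduled
+            ready = [insn for insn in unscheduled
+                    if deps_satisfied(insn, schedule)]
+            # of those, insns whose loops are all currently open
+            runnable = [insn for insn in ready
+                    if insn.all_inames() <= set(open_loops)]
+            if runnable:
+                insn = runnable.pop()
+                schedule.append(RunInstruction(id=insn.id))
+                unscheduled.remove(insn)
+            else:
+                open_loops.append(most_shareable_loop(ready, open_loops))
+        return schedule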
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 03d0ddb50..7e0474c4d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,4 +1,5 @@
 from __future__ import division
+import islpy as isl
 
 def register_mpz_with_pymbolic():
     from pymbolic.primitives import register_constant_class
@@ -8,42 +9,14 @@ def register_mpz_with_pymbolic():
 
 register_mpz_with_pymbolic()
 
+import islpy as isl
+from islpy import dim_type
+import numpy as np
 
-# Immediately:
-# ------------
-# TODO: Imitate codegen bulk slab handling in bulk slab trials
-
-# For writeup:
-# ------------
-# TODO: Try, fix reg. prefetch (DG example) / CSEs
-#   ILP and reg. prefetch interact!
-# TODO: Custom reductions per red. axis
-# TODO: Functions
-# TODO: Common subexpressions
-# TODO: Array common subexpressions (shared and private!)
-# TODO: ILP arrays
-# FIXME: support non-reductive dimensions (what did I mean here?)
-# FIXME: write names should be assigned during scheduling
-# FIXME: screwy lower bounds in ILP
-# FIXME: Leading syncthreads elimination
-
-# TODO: Divisibility
-# TODO: Try, fix indirect addressing
-
-# TODO: Implement GT200 matmul, Fermi matmul, DG
-# TODO: DMA engine threads?
-# TODO: Deal with equalities that crop up.
-# TODO: Better user feedback.
-
-# Later:
-# ------
-# TODO: Try different kernels
-# TODO: - Tricky: Convolution, Stencil
-# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
-# TODO: implement efficient ceil_div? (as opposed to floor_div)
-# TODO: why are corner cases inefficient?
-# TODO: Use gists (why do disjoint sets arise?)
+# TODO list moved to MEMO in root @@ -62,16 +35,516 @@ from loopy.compiled import CompiledKernel, drive_timing_run # }}} -# {{{ high-level modifiers +# {{{ user-facing kernel manipulation functionality + +def split_dimension(kernel, name, inner_length, padded_length=None, + outer_name=None, inner_name=None, + outer_tag=None, inner_tag=None, + outer_slab_increments=(0, -1), no_slabs=None): + + if name not in kernel.all_inames(): + raise ValueError("cannot split loop for unknown variable '%s'" % name) + + if no_slabs: + outer_slab_increments = (0, 0) + + if padded_length is not None: + inner_tag = inner_tag.copy(forced_length=padded_length) + + if outer_name is None: + outer_name = name+"_outer" + if inner_name is None: + inner_name = name+"_inner" + + outer_var_nr = kernel.space.dim(dim_type.set) + inner_var_nr = kernel.space.dim(dim_type.set)+1 + + def process_set(s): + s = s.add_dims(dim_type.set, 2) + s.set_dim_name(dim_type.set, outer_var_nr, outer_name) + s.set_dim_name(dim_type.set, inner_var_nr, inner_name) + + from loopy.isl import make_slab + + space = s.get_space() + inner_constraint_set = ( + make_slab(space, inner_name, 0, inner_length) + # name = inner + length*outer + .add_constraint(isl.Constraint.eq_from_names( + space, {name:1, inner_name: -1, outer_name:-inner_length}))) + + name_dim_type, name_idx = space.get_var_dict()[name] + return (s + .intersect(inner_constraint_set) + .project_out(name_dim_type, name_idx, 1)) + + new_domain = process_set(kernel.domain) + new_assumptions = process_set(kernel.assumptions) + + from pymbolic import var + inner = var(inner_name) + outer = var(outer_name) + new_loop_index = inner + outer*inner_length + + # {{{ look for reduction loops to split, split seq. tags + + from loopy.symbolic import ReductionLoopSplitter + + rls = ReductionLoopSplitter(name, outer_name, inner_name) + new_insns = [] + for insn in kernel.instructions: + insn = insn.copy(expression=rls(insn.expression)) + old_iname_tag = insn.iname_to_tag.get(name) + + new_iname_to_tag = insn.iname_to_tag.copy() + tagged_ok = False + from loopy.kernel import SequentialTag + if isinstance(old_iname_tag, SequentialTag): + tagged_ok = True + del new_iname_to_tag[name] + new_iname_to_tag[outer_name] = old_iname_tag + new_iname_to_tag[inner_name] = old_iname_tag + + if name in insn.forced_iname_deps: + new_forced_iname_deps = insn.forced_iname_deps[:] + new_forced_iname_deps.remove(name) + new_forced_iname_deps.extend([outer_name, inner_name]) + else: + new_forced_iname_deps = insn.forced_iname_deps + + insn = insn.substitute(name, new_loop_index, tagged_ok=tagged_ok).copy( + iname_to_tag=new_iname_to_tag, + forced_iname_deps=new_forced_iname_deps + ) + + new_insns.append(insn) + + # }}} + + iname_slab_increments = kernel.iname_slab_increments.copy() + iname_slab_increments[outer_name] = outer_slab_increments + result = (kernel + .copy(domain=new_domain, + assumptions=new_assumptions, + iname_slab_increments=iname_slab_increments, + name_to_dim=None, + instructions=new_insns)) + + return tag_dimensions(result, {outer_name: outer_tag, inner_name: inner_tag}) + + + + +def tag_dimensions(kernel, iname_to_tag, insn_id=None): + from loopy.kernel import UniqueTag, ParallelTag, parse_tag + + iname_to_tag = dict((iname, parse_tag(tag)) + for iname, tag in iname_to_tag.iteritems()) + + new_insns = [] + for insn in kernel.instructions: + if insn_id is None or insn_id == insn.id: + new_iname_to_tag = insn.iname_to_tag.copy() + + existing_unique_tag_keys = set( + tag.key for tag in 
new_iname_to_tag.itervalues()
+                if isinstance(tag, UniqueTag))
+
+            for iname, tag in iname_to_tag.iteritems():
+                if iname not in insn.all_inames():
+                    continue
+
+                if isinstance(tag, ParallelTag) and iname in insn.sequential_inames():
+                    raise NotImplementedError("cannot parallelize reduction dimension (yet)")
+
+                new_iname_to_tag[iname] = tag
+
+                if isinstance(tag, UniqueTag):
+                    tag_key = tag.key
+                    if tag_key in existing_unique_tag_keys:
+                        raise RuntimeError("repeated unique tag key: %s" % tag_key)
+
+                    existing_unique_tag_keys.add(tag_key)
+
+            new_insns.append(
+                    insn.copy(iname_to_tag=new_iname_to_tag))
+        else:
+            new_insns.append(insn)
+
+    return kernel.copy(instructions=new_insns)
+
+
+
+
+def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=None,
+        dup_iname_to_tag={}, new_inames=None):
+    """
+    :arg duplicate_inames:
+        also determines index order of temporary array
+    :arg parallel_inames: only a convenient interface for dup_iname_to_tag
+    """
+
+    dtype = np.dtype(dtype)
+
+    from pytools import any
+
+    # {{{ process parallel_inames and dup_iname_to_tag arguments
+
+    if parallel_inames is None:
+        # default to all-parallel
+        parallel_inames = duplicate_inames
+
+    dup_iname_to_tag = dup_iname_to_tag.copy()
+    from loopy.kernel import TAG_AUTO_WORK_ITEM_IDX
+    for piname in parallel_inames:
+        dup_iname_to_tag[piname] = TAG_AUTO_WORK_ITEM_IDX()
+
+    for diname in duplicate_inames:
+        dup_iname_to_tag.setdefault(diname, None)
+
+    if not set(dup_iname_to_tag.iterkeys()) <= set(duplicate_inames):
+        raise RuntimeError("parallelization/tag info for non-duplicated inames "
+                "may not be passed")
+
+    # here, all information is consolidated into dup_iname_to_tag
+
+    # }}}
+
+    # {{{ process new_inames argument, think of new inames for inames to be duplicated
+
+    if new_inames is None:
+        new_inames = [None] * len(duplicate_inames)
+
+    temp_new_inames = []
+    for old_iname, new_iname in zip(duplicate_inames, new_inames):
+        if new_iname is None:
+            new_iname = kernel.make_unique_var_name(old_iname)
+        temp_new_inames.append(new_iname)
+
+    new_inames = temp_new_inames
+
+    # }}}
+
+    target_var_name = kernel.make_unique_var_name(cse_tag)
+
+    cse_lookup_table = {}
+
+    cse_result_insns = []
+
+    def map_cse(expr, rec):
+        if expr.prefix != cse_tag:
+            return
+
+        # FIXME stencils and variable shuffle detection would happen here
+
+        try:
+            cse_replacement, dep_id = cse_lookup_table[expr]
+        except KeyError:
+            pass
+        else:
+            return cse_replacement
+
+        if cse_result_insns:
+            raise RuntimeError("CSE tag '%s' is not unique" % cse_tag)
+
+        # {{{ concoct new inner and outer expressions
+
+        from pymbolic import var
+        assignee = var(target_var_name)
+        new_outer_expr = assignee
+
+        if duplicate_inames:
+            assignee = assignee[tuple(
+                var(iname) for iname in new_inames
+                )]
+            new_outer_expr = new_outer_expr[tuple(
+                var(iname) for iname in duplicate_inames
+                )]
+
+        from pymbolic import substitute
+        new_inner_expr = substitute(rec(expr.child), dict(
+            (old_iname, var(new_iname))
+            for old_iname, new_iname in zip(duplicate_inames, new_inames)))
+
+        # }}}
+
+        # {{{ build the new instruction's iname_to_tag
 
-def split_dimension(knl, *args, **kwargs):
-    return knl.split_dimension(*args, **kwargs)
+        from loopy.symbolic import IndexVariableFinder
+        insn_iname_to_tag = {}
+        for old_iname, new_iname in zip(duplicate_inames, new_inames):
+            insn_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname]
+
+        index_deps = (
+                IndexVariableFinder()(new_outer_expr)
+                | set(new_inames))
+
+        for iname in index_deps:
+            if iname not in insn_iname_to_tag:
+                # assume generating instruction's view on
+                # inames on which we don't have an opinion.
+
+                if iname in insn.iname_to_tag:
+                    insn_iname_to_tag[iname] = insn.iname_to_tag[iname]
+
+        # }}}
+
+        from loopy.kernel import Instruction
+        new_insn = Instruction(
+                id=kernel.make_unique_instruction_id(based_on=cse_tag),
+                assignee=assignee,
+                expression=new_inner_expr,
+                iname_to_tag=insn_iname_to_tag,
+                )
+
+        cse_result_insns.append(new_insn)
+
+        return new_outer_expr
+
+    from loopy.symbolic import CSECallbackMapper
+    cse_cb_mapper = CSECallbackMapper(map_cse)
+
+    new_insns = []
+    for insn in kernel.instructions:
+        was_empty = not bool(cse_result_insns)
+        new_expr = cse_cb_mapper(insn.expression)
+
+        if was_empty and cse_result_insns:
+            new_iname_to_tag = insn.iname_to_tag.copy()
+            new_iname_to_tag.update(dup_iname_to_tag)
+
+            new_insns.append(
+                    insn.copy(
+                        expression=new_expr,
+                        iname_to_tag=new_iname_to_tag))
+        else:
+            new_insns.append(insn)
+
+    new_insns.extend(cse_result_insns)
+
+    # {{{ build new domain, duplicating each constraint on duplicated inames
+
+    start_idx = kernel.domain.dim(dim_type.set)
+    new_domain = kernel.domain.insert_dims(
+            dim_type.set, start_idx,
+            len(duplicate_inames))
+    new_name_to_dim = kernel.name_to_dim.copy()
+    for i, iname in enumerate(new_inames):
+        new_idx = start_idx+i
+        new_domain = new_domain.set_dim_name(
+                dim_type.set, new_idx, iname)
+        new_name_to_dim[iname] = (dim_type.set, new_idx)
+
+    dup_iname_dims = [kernel.name_to_dim[iname]
+            for iname in duplicate_inames]
+    old_to_new = dict((old_iname, new_iname)
+            for old_iname, new_iname in zip(duplicate_inames, new_inames))
+
+    new_domain_bs, = new_domain.get_basic_sets()
+
+    for cns in new_domain_bs.get_constraints():
+        if any(cns.involves_dims(*dim+(1,)) for dim in dup_iname_dims):
+            assert not cns.is_div_constraint()
+            if cns.is_equality():
+                new_cns = cns.equality_alloc(new_domain.get_space())
+            else:
+                new_cns = cns.inequality_alloc(new_domain.get_space())
+
+            new_coeffs = {}
+            for key, val in cns.get_coefficients_by_name().iteritems():
+                if key in old_to_new:
+                    new_coeffs[old_to_new[key]] = val
+                else:
+                    new_coeffs[key] = val
+
+            new_cns = new_cns.set_coefficients_by_name(new_coeffs)
+            new_domain = new_domain.add_constraint(new_cns)
+
+    # }}}
+
+    # {{{ set up data for temp variable
+
+    temp_var_base_indices = []
+    temp_var_shape = []
+
+    for iname in new_inames:
+        lower_bound_pw_aff = new_domain.dim_min(new_name_to_dim[iname][1])
+        upper_bound_pw_aff = new_domain.dim_max(new_name_to_dim[iname][1])
+
+        from loopy.isl import static_max_of_pw_aff
+        from loopy.symbolic import pw_aff_to_expr
+
+        temp_var_shape.append(static_max_of_pw_aff(
+            upper_bound_pw_aff - lower_bound_pw_aff + 1))
+        temp_var_base_indices.append(pw_aff_to_expr(lower_bound_pw_aff))
+
+    from loopy.kernel import TemporaryVariable, ParallelTag
+    new_temporary_variables = kernel.temporary_variables + [
+            TemporaryVariable(
+                name=target_var_name,
+                dtype=dtype,
+                base_indices=temp_var_base_indices,
+                shape=temp_var_shape,
+                is_local=any(isinstance(tag, ParallelTag)
+                    for tag in dup_iname_to_tag.itervalues()))
+            ]
+
+    # }}}
+
+    return kernel.copy(
+            domain=new_domain,
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables,
+            name_to_dim=new_name_to_dim)
+
+
+
+
+def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
+    dtype = np.dtype(dtype)
+
+    new_insns = []
+    new_temporary_variables = kernel.temporary_variables[:]
+
+    def map_reduction(expr, rec):
+        sub_expr = rec(expr.expr)
+
+        if reduction_tag is not None and expr.tag != reduction_tag:
+            return
+
+        from pymbolic import var
+
+        from loopy.kernel import REGISTERED_REDUCTION_OPS
+        red_op = REGISTERED_REDUCTION_OPS[expr.operation]
+
+        target_var_name = kernel.make_unique_var_name("red_"+loop_iname,
+                extra_used_vars=set(tv.name for tv in new_temporary_variables))
+        target_var = var(target_var_name)
+
+        from loopy.kernel import Instruction
+
+        from loopy.kernel import TemporaryVariable
+        new_temporary_variables.append(
+                TemporaryVariable(
+                    name=target_var_name,
+                    dtype=dtype,
+                    shape=(),
+                    is_local=False))
+
+        init_iname_to_tag = insn.iname_to_tag.copy()
+
+        init_insn = Instruction(
+                id=kernel.make_unique_instruction_id(
+                    extra_used_ids=set(ni.id for ni in new_insns)),
+                assignee=target_var,
+                forced_iname_deps=list(insn.all_inames() - set([expr.iname])),
+                expression=red_op.neutral_element)
+
+        new_insns.append(init_insn)
+
+        from loopy.kernel import SequentialTag
+        reduction_insn = Instruction(
+                id=kernel.make_unique_instruction_id(
+                    extra_used_ids=set(ni.id for ni in new_insns)),
+                assignee=target_var,
+                expression=red_op(target_var, sub_expr),
+                forced_insn_deps=[init_insn.id],
+                use_auto_dependencies=False,
+                forced_iname_deps=list(insn.all_inames()),
+                iname_to_tag={expr.iname: SequentialTag()})
+
+        new_insns.append(reduction_insn)
+
+        new_insn_forced_insn_deps.append(reduction_insn.id)
+
+        return target_var
+
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(map_reduction)
+
+    for insn in kernel.instructions:
+        new_insn_forced_insn_deps = []
+
+        new_insns.append(
+                insn.copy(
+                    expression=cb_mapper(insn.expression),
+                    forced_insn_deps=insn.forced_insn_deps
+                        + new_insn_forced_insn_deps,
+                    ))
+
+    return kernel.copy(
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables)
+
+
+
+
+def get_problems(kernel, parameters, emit_warnings=True):
+    """
+    :return: *(max_severity, list of (severity, msg))*, where *severity*
+        ranges from 1-5. '5' means 'will certainly not run'.
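+
+    A hypothetical invocation (the parameter name and value are made up):
+
+        max_sev, msgs = get_problems(knl, {"n": 256})
+        if max_sev >= 5:
+            raise RuntimeError("\n".join(s for sev, s in msgs))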
+ """ + msgs = [] + + def msg(severity, s): + if emit_warnings: + from warnings import warn + from loopy import LoopyAdvisory + warn(s, LoopyAdvisory) + + msgs.append((severity, s)) + + glens = kernel.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True) + llens = kernel.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False) + + from pymbolic import evaluate + glens = evaluate(glens, parameters) + llens = evaluate(llens, parameters) + + if (max(len(glens), len(llens)) + > kernel.device.max_work_item_dimensions): + msg(5, "too many work item dimensions") + + for i in range(len(llens)): + if llens[i] > kernel.device.max_work_item_sizes[i]: + msg(5, "group axis %d too big" % i) + + from pytools import product + if product(llens) > kernel.device.max_work_group_size: + msg(5, "work group too big") + + from pyopencl.characterize import usable_local_mem_size + if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: + msg(5, "using too much local memory") + else: + msg(4, "using more local memory than available--" + "possibly OK due to cache nature") + + const_arg_count = sum( + 1 for arg in kernel.args + if isinstance(arg, ArrayArg) and arg.constant_mem) + + if const_arg_count > kernel.device.max_constant_args: + msg(5, "too many constant arguments") + + max_severity = 0 + for sev, msg in msgs: + max_severity = max(sev, max_severity) + return max_severity, msgs + +# }}} + +# {{{ high-level modifiers def get_input_access_descriptors(kernel): """Return a dictionary mapping input vectors to a list of input access descriptor. An input access descriptor is a tuple (input_vec, index_expr). """ + 1/0 # broken + from loopy.symbolic import VariableIndexExpressionCollector from pytools import flatten @@ -94,6 +567,7 @@ def add_prefetch(kernel, input_access_descr, fetch_dims, loc_fetch_axes={}): :arg fetch_dims: loop dimensions indexing the input variable on which the prefetch is to be carried out. """ + 1/0 # broken if isinstance(input_access_descr, str): var_name = input_access_descr diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 4c509d4c7..e2cc6f0f3 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -41,7 +41,7 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): upper = [] equality = [] - space = bset.get_dim() + space = bset.get_space() var_dict = space.get_var_dict() iname_tp, iname_idx = var_dict[iname] @@ -64,6 +64,9 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): return lower, upper, equality def solve_constraint_for_bound(cns, iname): + from warnings import warn + warn("deprecated") + from loopy.symbolic import constraint_to_expr rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) @@ -94,6 +97,9 @@ def get_bounds(set, iname, admissible_inames, allow_parameters): as actual bounds. 
""" + from warnings import warn + warn("deprecated") + lower, upper, equality = get_bounds_constraints( set, iname, admissible_inames, allow_parameters) @@ -137,7 +143,7 @@ def constraint_to_code(ccm, cns): return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op) def filter_necessary_constraints(implemented_domain, constraints): - space = implemented_domain.get_dim() + space = implemented_domain.get_space() return [cns for cns in constraints if not implemented_domain.is_subset( @@ -152,7 +158,7 @@ def generate_bounds_checks(domain, check_vars, implemented_domain): .coalesce() .get_basic_sets()) - space = domain.get_dim() + space = domain.get_space() cast_constraints = [] diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index defaaa81a..1b7206a48 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -172,12 +172,12 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, # {{{ build an implemented domain with an extra index variable from loopy.symbolic import eq_constraint_from_expr - idx_var_dim_idx = implemented_domain.get_dim().size(dim_type.set) + idx_var_dim_idx = implemented_domain.get_space().size(dim_type.set) impl_domain_with_index_var = implemented_domain.add_dims(dim_type.set, 1) impl_domain_with_index_var = ( impl_domain_with_index_var .set_dim_name(dim_type.set, idx_var_dim_idx, idx_var_name)) - aug_space = impl_domain_with_index_var.get_dim() + aug_space = impl_domain_with_index_var.get_space() impl_domain_with_index_var = ( impl_domain_with_index_var .intersect( @@ -220,7 +220,7 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, impl_domain_with_index_var .intersect( make_slab( - impl_domain_with_index_var.get_dim(), idx_var_name, + impl_domain_with_index_var.get_space(), idx_var_name, cur_index, min(dim_length, cur_index+total_realiz_size))) .project_out(dim_type.set, idx_var_dim_idx, 1)) diff --git a/loopy/isl.py b/loopy/isl.py index e31575e2d..bf7b5a064 100644 --- a/loopy/isl.py +++ b/loopy/isl.py @@ -1,3 +1,5 @@ +"""isl helpers""" + from __future__ import division import islpy as isl @@ -7,9 +9,9 @@ from islpy import dim_type -# {{{ isl helpers - def cast_constraint_to_space(cns, new_space, as_equality=None): + 1/0 # bad routine, shouldn't be used + if as_equality is None: as_equality = cns.is_equality() @@ -19,16 +21,30 @@ def cast_constraint_to_space(cns, new_space, as_equality=None): factory = isl.Constraint.ineq_from_names return factory(new_space, cns.get_coefficients_by_name()) -def block_shift_constraint(cns, iname, multiple, as_equality=None): - cns = copy_constraint(cns, as_equality=as_equality) + + + +def block_shift_constraint(cns, type, pos, multiple, as_equality=None): + if as_equality != cns.is_equality(): + if as_equality: + factory = isl.Constraint.equality_from_aff + else: + factory = isl.Constraint.inequality_from_aff + + cns = factory(cns.get_aff()) + cns = cns.set_constant(cns.get_constant() - + cns.get_coefficients_by_name()[iname]*multiple) + + cns.get_coefficient(type, pos)*multiple) return cns + + + + def negate_constraint(cns): assert not cns.is_equality() # FIXME hackety hack - my_set = (isl.BasicSet.universe(cns.get_dim()) + my_set = (isl.BasicSet.universe(cns.get_space()) .add_constraint(cns)) my_set = my_set.complement() @@ -39,9 +55,8 @@ def negate_constraint(cns): result, = results return result -def copy_constraint(cns, as_equality=None): - return cast_constraint_to_space(cns, cns.get_dim(), - as_equality=as_equality) + + def make_index_map(set, 
index_expr): from loopy.symbolic import eq_constraint_from_expr @@ -52,7 +67,7 @@ def make_index_map(set, index_expr): amap = isl.Map.from_domain(set).add_dims(dim_type.out, len(index_expr)) out_names = ["_ary_idx_%d" % i for i in range(len(index_expr))] - dim = amap.get_dim() + dim = amap.get_space() all_constraints = tuple( eq_constraint_from_expr(dim, iexpr_i) for iexpr_i in index_expr) @@ -66,6 +81,10 @@ def make_index_map(set, index_expr): return amap + + + + def make_slab(space, iname, start, stop): from loopy.symbolic import ineq_constraint_from_expr from pymbolic import var @@ -78,6 +97,30 @@ def make_slab(space, iname, start, stop): .add_constraint(ineq_constraint_from_expr( space, stop-1 - var_iname))) -# }}} + + + +def set_is_universe(set): + bs = set.get_basic_sets() + if len(bs) == 1: + return bs[0].is_universe() + else: + return isl.Set.universe_like(set).is_subset(set) + + + + +def static_max_of_pw_aff(pw_aff): + for set, aff in pw_aff.get_pieces(): + candidate_pw_aff = isl.PwAff.from_aff(aff) + + if set_is_universe(candidate_pw_aff.ge_set(pw_aff)): + return aff + + raise ValueError("a static maximum was not found for PwAff '%s'" + % pw_aff) + + + # vim: foldmethod=marker diff --git a/loopy/kernel.py b/loopy/kernel.py index 2838d09d0..da04a6545 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -4,14 +4,101 @@ from __future__ import division import numpy as np from pytools import Record, memoize_method -from pymbolic.mapper.dependency import DependencyMapper -import pyopencl as cl import islpy as isl from islpy import dim_type +from pymbolic import var + +# {{{ index tags + +class IndexTag(Record): + __slots__ = [] + + def __hash__(self): + raise RuntimeError("use .key to hash index tags") + + + + +class SequentialTag(IndexTag): + pass + +class ParallelTag(IndexTag): + pass + +class UniqueTag(IndexTag): + @property + def key(self): + return type(self) + +class ParallelTagWithAxis(ParallelTag, UniqueTag): + __slots__ = ["axis"] + + def __init__(self, axis): + Record.__init__(self, + axis=axis) + + @property + def key(self): + return (type(self), self.axis) + + def __str__(self): + return "%s.%d" % ( + self.print_name, self.axis) + +class TAG_GROUP_IDX(ParallelTagWithAxis): + print_name = "g" + +class TAG_WORK_ITEM_IDX(ParallelTagWithAxis): + print_name = "l" + +class TAG_AUTO_WORK_ITEM_IDX(ParallelTag): + def __str__(self): + return "l.auto" + +class TAG_ILP(ParallelTag): + def __str__(self): + return "ilp" + +class BaseUnrollTag(IndexTag): + pass + +class TAG_UNROLL_STATIC(BaseUnrollTag): + def __str__(self): + return "unr" + +class TAG_UNROLL_INCR(BaseUnrollTag): + def __str__(self): + return "unri" + +def parse_tag(tag): + if tag is None: + return tag + + if isinstance(tag, IndexTag): + return tag + + if not isinstance(tag, str): + raise ValueError("cannot parse tag: %s" % tag) + + if tag in ["unrs", "unr"]: + return TAG_UNROLL_STATIC() + elif tag == "unri": + return TAG_UNROLL_INCR() + elif tag == "ilp": + return TAG_ILP() + elif tag.startswith("g."): + return TAG_GROUP_IDX(int(tag[2:])) + elif tag.startswith("l."): + return TAG_WORK_ITEM_IDX(int(tag[2:])) + else: + raise ValueError("cannot parse tag: %s" % tag) + +# }}} + # {{{ arguments class ArrayArg: @@ -87,88 +174,225 @@ class ScalarArg: # }}} -# {{{ index tags +# {{{ temporary variable -class IndexTag(Record): - __slots__ = [] +class TemporaryVariable(Record): + """ + :ivar name: + :ivar dtype: + :ivar shape: + :ivar base_indices: + :ivar is_local: + """ - def __hash__(self): - raise RuntimeError("use .key to 
hash index tags") + def __init__(self, name, dtype, shape, is_local, base_indices=None): + if base_indices is None: + base_indices = (0,) * len(shape) + Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local) + @property + def nbytes(self): + from pytools import product + return product(self.shape)*self.dtype.itemsize +# }}} -class ParallelTag(IndexTag): - pass +# {{{ instruction -class UniqueTag(IndexTag): - @property - def key(self): - return type(self) +class Instruction(Record): + #:ivar kernel: handle to the :class:`LoopKernel` of which this instruction + #is a part. (not yet) + """ + :ivar id: An (otherwise meaningless) identifier that is unique within + a :class:`LoopKernel`. + :ivar assignee: + :ivar expression: + :ivar use_auto_dependencies: + :ivar forced_iname_deps: a list of inames that are added to the list of iname + dependencies + :ivar forced_insn_deps: a list of ids of :class:`Instruction` instances that + *must* be executed before this one. Note that loop scheduling augments this + by adding dependencies on any writes to temporaries read by this instruction + *if* use_auto_dependencies is True. + :ivar iname_to_tag: a map from loop domain variables to subclasses + of :class:`IndexTag` + """ + def __init__(self, + id, assignee, expression, + use_auto_dependencies=True, + forced_iname_deps=[], forced_insn_deps=[], + iname_to_tag={}): -class ParallelTagWithAxis(ParallelTag, UniqueTag): - __slots__ = ["axis", "forced_length"] + # {{{ find and properly tag reduction inames + + reduction_inames = set() + + from loopy.symbolic import ReductionCallbackMapper + + def map_reduction(expr, rec): + rec(expr.expr) + reduction_inames.add(expr.iname) + + ReductionCallbackMapper(map_reduction)(expression) + + if reduction_inames: + iname_to_tag = iname_to_tag.copy() + + for iname in reduction_inames: + tag = iname_to_tag.get(iname) + if not (tag is None or isinstance(tag, SequentialTag)): + raise RuntimeError("inconsistency detected: " + "sequential/reduction iname '%s' was " + "tagged otherwise" % iname) + + iname_to_tag[iname] = SequentialTag() + + # }}} - def __init__(self, axis, forced_length=None): Record.__init__(self, - axis=axis, forced_length=forced_length) + id=id, assignee=assignee, expression=expression, + use_auto_dependencies=use_auto_dependencies, + forced_iname_deps=forced_iname_deps, + forced_insn_deps=forced_insn_deps, + iname_to_tag=dict( + (iname, parse_tag(tag)) + for iname, tag in iname_to_tag.iteritems())) - @property - def key(self): - return (type(self), self.axis) + unused_tags = set(self.iname_to_tag.iterkeys()) - self.all_inames() + if unused_tags: + raise RuntimeError("encountered tags for unused inames: " + + ", ".join(unused_tags)) - def __repr__(self): - if self.forced_length: - return "%s(%d, flen=%d)" % ( - self.print_name, self.axis, - self.forced_length) - else: - return "%s(%d)" % ( - self.print_name, self.axis) + @memoize_method + def all_inames(self): + from loopy.symbolic import VariableIndexExpressionCollector + idx_exprs = ( + VariableIndexExpressionCollector()(self.expression) + | VariableIndexExpressionCollector()(self.assignee)) + from pymbolic.mapper.dependency import DependencyMapper + index_vars = set() -class TAG_GROUP_IDX(ParallelTagWithAxis): - print_name = "GROUP_IDX" + from pymbolic.primitives import Variable + for idx_expr in idx_exprs: + for i in DependencyMapper()(idx_expr): + if isinstance(i, Variable): + index_vars.add(i.name) -class TAG_WORK_ITEM_IDX(ParallelTagWithAxis): - print_name = "WORK_ITEM_IDX" + 
-class TAG_ILP(ParallelTag):
-    def __repr__(self):
-        return "TAG_ILP"
+    @memoize_method
+    def sequential_inames(self):
+        result = set()
 
-class BaseUnrollTag(IndexTag):
-    pass
+        for iname, tag in self.iname_to_tag.iteritems():
+            if isinstance(tag, SequentialTag):
+                result.add(iname)
 
-class TAG_UNROLL_STATIC(BaseUnrollTag):
-    def __repr__(self):
-        return "TAG_UNROLL_STATIC"
+        return result
 
-class TAG_UNROLL_INCR(BaseUnrollTag):
-    def __repr__(self):
-        return "TAG_UNROLL_INCR"
+    def substitute(self, old_var, new_expr, tagged_ok=False):
+        from loopy.symbolic import SubstitutionMapper
 
-def parse_tag(tag):
-    if tag is None:
-        return tag
+        prev_tag = self.iname_to_tag.get(old_var)
+        if prev_tag is not None and not tagged_ok:
+            raise RuntimeError("cannot substitute already tagged variable '%s'"
+                    % old_var)
 
-    if isinstance(tag, IndexTag):
-        return tag
+        subst_map = {var(old_var): new_expr}
 
-    if not isinstance(tag, str):
-        raise ValueError("cannot parse tag: %s" % tag)
+        subst_mapper = SubstitutionMapper(subst_map.get)
+        return self.copy(
+                assignee=subst_mapper(self.assignee),
+                expression=subst_mapper(self.expression))
 
-    if tag in ["unrs", "unr"]:
-        return TAG_UNROLL_STATIC()
-    elif tag == "unri":
-        return TAG_UNROLL_INCR()
-    elif tag == "ilp":
-        return TAG_ILP()
-    elif tag.startswith("g."):
-        return TAG_GROUP_IDX(int(tag[2:]))
-    elif tag.startswith("l."):
-        return TAG_WORK_ITEM_IDX(int(tag[2:]))
-    else:
-        raise ValueError("cannot parse tag: %s" % tag)
+    def __str__(self):
+        loop_descrs = []
+        for iname in self.all_inames():
+            tag = self.iname_to_tag.get(iname)
+
+            if tag is None:
+                loop_descrs.append(iname)
+            else:
+                loop_descrs.append("%s: %s" % (iname, tag))
+
+        result = "%s: %s <- %s\n    [%s]" % (self.id,
+                self.assignee, self.expression, ", ".join(loop_descrs))
+
+        return result
+
+# }}}
+
+# {{{ reduction operations
+
+class ReductionOperation:
+    """
+    :ivar neutral_element:
+    """
+
+    def __call__(self, operand1, operand2):
+        raise NotImplementedError
+
+class SumReductionOperation:
+    neutral_element = 0
+
+    def __call__(self, operand1, operand2):
+        return operand1 + operand2
+
+class ProductReductionOperation:
+    neutral_element = 1
+
+    def __call__(self, operand1, operand2):
+        return operand1 * operand2
+
+class FloatingPointMaxOperation:
+    # OpenCL 1.1, section 6.11.2
+    neutral_element = -var("INFINITY")
+
+    def __call__(self, operand1, operand2):
+        from pymbolic.primitives import FunctionSymbol
+        return FunctionSymbol("max")(operand1, operand2)
+
+class FloatingPointMinOperation:
+    # OpenCL 1.1, section 6.11.2
+    neutral_element = var("INFINITY")
+
+    def __call__(self, operand1, operand2):
+        from pymbolic.primitives import FunctionSymbol
+        return FunctionSymbol("min")(operand1, operand2)
+
+
+
+
+REGISTERED_REDUCTION_OPS = {
+        "sum": SumReductionOperation(),
+        "product": ProductReductionOperation(),
+        "fp_max": FloatingPointMaxOperation(),
+        "fp_min": FloatingPointMinOperation(),
+        }
+
+
+
+
+def register_reduction(prefix, op):
+    """Register a new :class:`ReductionOperation`.
+
+    :arg prefix: the desired name of the operation
+    :return: the name actually assigned to the operation,
+        will start with *prefix*.
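+
+    For example, since "fp_max" is already taken above,
+    ``register_reduction("fp_max", my_op)`` (for some operation
+    object ``my_op``) would return "fp_max_0".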
+ """ + + from loopy.tools import generate_unique_possibilities + + for name in generate_unique_possibilities(prefix): + if name not in REGISTERED_REDUCTION_OPS: + REGISTERED_REDUCTION_OPS[name] = op + return name # }}} @@ -178,12 +402,9 @@ class LoopKernel(Record): """ :ivar device: :class:`pyopencl.Device` :ivar domain: :class:`islpy.BasicSet` - :ivar iname_to_tag: :ivar instructions: :ivar args: - :ivar prefetch: :ivar schedule: - :ivar register_prefetch: :ivar name: :ivar preamble: :ivar assumptions: the initial implemented_domain, captures assumptions @@ -191,21 +412,29 @@ class LoopKernel(Record): :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, upper_incr) tuples that will be separated out in the execution to generate 'bulk' slabs with fewer conditionals. + :ivar temporary_variables: + :ivar workgroup_size: + :ivar name_to_dim: A lookup table from inames to ISL-style + (dim_type, index) tuples """ - def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, - register_prefetch=None, name="loopy_kernel", - iname_to_tag={}, is_divisible=lambda dividend, divisor: False, + def __init__(self, device, domain, instructions, args=None, schedule=None, + name="loopy_kernel", preamble=None, assumptions=None, - iname_slab_increments={}): + iname_slab_increments={}, + temporary_variables=[], + workgroup_size=None, + name_to_dim=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" - :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` """ def parse_if_necessary(insn): from pymbolic import parse + + if isinstance(insn, Instruction): + return insn if isinstance(insn, str): lhs, rhs = insn.split("=") elif isinstance(insn, tuple): @@ -213,22 +442,35 @@ class LoopKernel(Record): if isinstance(lhs, str): lhs = parse(lhs) + if isinstance(rhs, str): - from loopy.symbolic import CSERealizer - rhs = CSERealizer()(parse(rhs)) + from loopy.symbolic import FunctionToPrimitiveMapper + rhs = parse(rhs) + rhs = FunctionToPrimitiveMapper()(rhs) - return lhs, rhs + return Instruction( + id=self.make_unique_instruction_id(insns), + assignee=lhs, expression=rhs) if isinstance(domain, str): ctx = isl.Context() domain = isl.Set.read_from_str(ctx, domain, nparam=-1) - insns = [parse_if_necessary(insn) for insn in instructions] + if name_to_dim is None: + name_to_dim = domain.get_space().get_var_dict() + + insns = [] + for insn in instructions: + # must construct list one-by-one to facilitate unique id generation + insns.append(parse_if_necessary(insn)) + + if len(set(insn.id for insn in insns)) != len(insns): + raise RuntimeError("instruction ids do not appear to be unique") if assumptions is None: - assumptions = isl.Set.universe(domain.get_dim()) + assumptions = isl.Set.universe(domain.get_space()) elif isinstance(assumptions, str): - s = domain.get_dim() + s = domain.get_space() assumptions = isl.BasicSet.read_from_str(domain.get_ctx(), "[%s] -> {[%s]: %s}" % (",".join(s.get_name(dim_type.param, i) @@ -239,24 +481,50 @@ class LoopKernel(Record): nparam=-1) Record.__init__(self, - device=device, args=args, domain=domain, instructions=insns, - prefetch=prefetch, schedule=schedule, - register_prefetch=register_prefetch, name=name, - iname_to_tag=dict( - (iname, parse_tag(tag)) - for iname, tag in iname_to_tag.iteritems()), - is_divisible=is_divisible, + device=device, domain=domain, instructions=insns, + args=args, + schedule=schedule, + 
name=name, preamble=preamble, assumptions=assumptions, - iname_slab_increments=iname_slab_increments) + iname_slab_increments=iname_slab_increments, + temporary_variables=temporary_variables, + workgroup_size=workgroup_size, + name_to_dim=name_to_dim) + + def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): + if insns is None: + insns = self.instructions + + used_ids = set(insn.id for insn in insns) | extra_used_ids + + from loopy.tools import generate_unique_possibilities + for id_str in generate_unique_possibilities(based_on): + if id_str not in used_ids: + return id_str + + def make_unique_var_name(self, based_on="var", extra_used_vars=set()): + used_vars = ( + set(lv.name for lv in self.temporary_variables) + | set(arg.name for arg in self.args) + | set(self.name_to_dim.keys()) + | extra_used_vars) + + from loopy.tools import generate_unique_possibilities + for var_name in generate_unique_possibilities(based_on): + if var_name not in used_vars: + return var_name - # FIXME: assert empty intersection of loop vars and args - # FIXME: assert that isl knows about loop parameters + @property + @memoize_method + def dim_to_name(self): + from pytools import reverse_dict + return reverse_dict(self.name_to_dim) @property @memoize_method def space(self): - return self.domain.get_dim() + return self.domain.get_space() @property @memoize_method @@ -287,22 +555,6 @@ class LoopKernel(Record): from islpy import dim_type return set(self.space.get_var_dict(dim_type.set).iterkeys()) - @memoize_method - def output_inames(self): - dm = DependencyMapper(include_subscripts=False) - - output_indices = set() - for lvalue, expr in self.instructions: - output_indices.update( - set(v.name for v in dm(lvalue)) - & self.all_inames()) - - return output_indices - set(arg.name for arg in self.args) - - @memoize_method - def reduction_inames(self): - return self.all_inames() - self.output_inames() - def inames_by_tag_type(self, tag_type): return [iname for iname in self.all_inames() if isinstance(self.iname_to_tag.get(iname), tag_type)] @@ -360,216 +612,18 @@ class LoopKernel(Record): return s def local_mem_use(self): - return sum(pf.nbytes for pf in self.prefetch.itervalues()) - - @memoize_method - def input_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - input_vectors = set() - for lvalue, expr in self.instructions: - input_vectors.update( - set(v.name for v in dm(expr))) - - return input_vectors - self.all_inames() - set(self.scalar_loop_args) - - @memoize_method - def output_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - output_vectors = set() - for lvalue, expr in self.instructions: - output_vectors.update( - set(v.name for v in dm(lvalue))) - - return output_vectors - self.all_inames() - self.scalar_loop_args - - def _subst_insns(self, old_var, new_expr): - from pymbolic.mapper.substitutor import substitute - - subst_map = {old_var: new_expr} - - return [(substitute(lvalue, subst_map), - substitute(expr, subst_map)) - for lvalue, expr in self.instructions] - - def is_prefetch_variable(self, varname): - if self.prefetch: - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == varname: - return True - - return False - - def _subst_prefetch(self, old_var, new_expr): - # FIXME delete me - from pymbolic.mapper.substitutor import substitute - subst_map = {old_var: new_expr} - - result = {} - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == old_var: - raise RuntimeError("can't 
substitute prefetch dimension %s" - % old_var) - - new_pf = pf.copy(index_expr=substitute(pf.index_expr, subst_map)) - result[pf.input_vector, new_pf.index_expr] = new_pf - - return result + return sum(lv.nbytes for lv in self.temporary_variables + if lv.is_local) def substitute(self, old_var, new_expr): - copy = self.copy(instructions=self._subst_insns(old_var, new_expr)) - if self.prefetch: - raise RuntimeError("cannot substitute-prefetches already generated") - #copy.prefetch = self._subst_prefetch(old_var, new_expr) - if self.schedule is not None: raise RuntimeError("cannot substitute-schedule already generated") - return copy - - def split_dimension(self, name, inner_length, padded_length=None, - outer_name=None, inner_name=None, - outer_tag=None, inner_tag=None, - outer_slab_increments=(0, -1), no_slabs=None): - - if name not in self.all_inames(): - raise ValueError("cannot split loop for unknown variable '%s'" % name) - - if no_slabs: - outer_slab_increments = (0, 0) - - outer_tag = parse_tag(outer_tag) - inner_tag = parse_tag(inner_tag) - - if self.iname_to_tag.get(name) is not None: - raise RuntimeError("cannot split tagged dimension '%s'" % name) - - # {{{ check for repeated unique tag keys - - new_tag_keys = set(tag.key - for tag in [outer_tag, inner_tag] - if tag is not None - if isinstance(tag, UniqueTag)) - - repeated_tag_keys = new_tag_keys & set( - tag.key for tag in self.iname_to_tag.itervalues() - if isinstance(tag, UniqueTag)) - - if repeated_tag_keys: - raise RuntimeError("repeated tag(s): %s" % repeated_tag_keys) - - # }}} - - if padded_length is not None: - inner_tag = inner_tag.copy(forced_length=padded_length) - - if outer_name is None: - outer_name = name+"_outer" - if inner_name is None: - inner_name = name+"_inner" - - new_iname_to_tag = self.iname_to_tag.copy() - if inner_tag is not None: - new_iname_to_tag[inner_name] = inner_tag - if outer_tag is not None: - new_iname_to_tag[outer_name] = outer_tag - - from islpy import dim_type - outer_var_nr = self.space.size(dim_type.set) - inner_var_nr = self.space.size(dim_type.set)+1 - - def process_set(s): - s = s.add_dims(dim_type.set, 2) - s.set_dim_name(dim_type.set, outer_var_nr, outer_name) - s.set_dim_name(dim_type.set, inner_var_nr, inner_name) - - from loopy.isl import make_slab - - space = s.get_dim() - inner_constraint_set = ( - make_slab(space, inner_name, 0, inner_length) - # name = inner + length*outer - .add_constraint(isl.Constraint.eq_from_names( - space, {name:1, inner_name: -1, outer_name:-inner_length}))) - - name_dim_type, name_idx = space.get_var_dict()[name] - return (s - .intersect(inner_constraint_set) - .project_out(name_dim_type, name_idx, 1)) - - new_domain = process_set(self.domain) - new_assumptions = process_set(self.assumptions) - - from pymbolic import var - inner = var(inner_name) - outer = var(outer_name) - new_loop_index = inner + outer*inner_length - - iname_slab_increments = self.iname_slab_increments.copy() - iname_slab_increments[outer_name] = outer_slab_increments - return (self - .substitute(name, new_loop_index) - .copy(domain=new_domain, iname_to_tag=new_iname_to_tag, - assumptions=new_assumptions, - iname_slab_increments=iname_slab_increments)) - - def get_problems(self, parameters, emit_warnings=True): - """ - :return: *(max_severity, list of (severity, msg))*, where *severity* ranges from 1-5. - '5' means 'will certainly not run'. 
- """ - msgs = [] - - def msg(severity, s): - if emit_warnings: - from warnings import warn - from loopy import LoopyAdvisory - warn(s, LoopyAdvisory) - - msgs.append((severity, s)) - - glens = self.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True) - llens = self.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False) - - from pymbolic import evaluate - glens = evaluate(glens, parameters) - llens = evaluate(llens, parameters) - - if (max(len(glens), len(llens)) - > self.device.max_work_item_dimensions): - msg(5, "too many work item dimensions") - - for i in range(len(llens)): - if llens[i] > self.device.max_work_item_sizes[i]: - msg(5, "group axis %d too big" % i) - - from pytools import product - if product(llens) > self.device.max_work_group_size: - msg(5, "work group too big") - - from pyopencl.characterize import usable_local_mem_size - if self.local_mem_use() > usable_local_mem_size(self.device): - if self.device.local_mem_type == cl.device_local_mem_type.LOCAL: - msg(5, "using too much local memory") - else: - msg(4, "using more local memory than available--" - "possibly OK due to cache nature") - - const_arg_count = sum( - 1 for arg in self.args - if isinstance(arg, ArrayArg) and arg.constant_mem) - - if const_arg_count > self.device.max_constant_args: - msg(5, "too many constant arguments") - - max_severity = 0 - for sev, msg in msgs: - max_severity = max(sev, max_severity) - return max_severity, msgs + return self.copy(instructions=[ + insn.substitute(old_var, new_expr) + for insn in self.instructions]) # }}} + # vim: foldmethod=marker diff --git a/loopy/schedule.py b/loopy/schedule.py index ee5cab82a..08729bbe5 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -7,17 +7,113 @@ from pytools import Record # {{{ schedule items -class ScheduledLoop(Record): +class EnterLoop(Record): __slots__ = ["iname"] -class WriteOutput(Record): - pass +class LeaveLoop(Record): + __slots__ = [] -# plus loopy.prefetch.{Register,LocalMemory}Prefetch +class RunInstruction(Record): + __slots__ = ["id"] + +class Barrier(Record): + __slots__ = [] # }}} -def generate_loop_schedules(kernel, hints=[]): + + + +def fix_grid_sizes(kernel): + from warnings import warn + warn("fix_grid_sizes is unimplemented") + return kernel + + + + +def generate_loop_dep_graph(kernel): + """ + :return: a dict mapping an iname to the ones that need to be entered + before it. 
+ """ + result = {} + + print "------------------------------------------------------" + for i, insn_a in enumerate(kernel.instructions): + print i, insn_a + print insn_a.all_inames() + + print "------------------------------------------------------" + all_inames = kernel.all_inames() + for i_a, insn_a in enumerate(kernel.instructions): + for i_b, insn_b in enumerate(kernel.instructions): + if i_a == i_b: + continue + + a = insn_a.all_inames() + b = insn_b.all_inames() + intersection = a & b + sym_difference = (a|b) - intersection + + print i_a, i_b, intersection, sym_difference + if a <= b or b <= a: + for sd in sym_difference: + result.setdefault(sd, set()).update(intersection) + + print "------------------------------------------------------" + return result + + + + +def generate_loop_schedules_internal(kernel, entered_loops=[]): + scheduled_insn_ids = set(sched_item.id for sched_item in kernel.schedule + if isinstance(sched_item, RunInstruction)) + + all_inames = kernel.all_inames() + + + + +def generate_loop_schedules(kernel): + # {{{ check that all CSEs and reductions are realized + + from loopy.symbolic import CSECallbackMapper, ReductionCallbackMapper + + def map_reduction(expr, rec): + raise RuntimeError("all reductions must be realized before scheduling") + + def map_cse(expr, rec): + raise RuntimeError("all CSEs must be realized before scheduling") + + for insn in kernel.instructions: + ReductionCallbackMapper(map_reduction)(insn.expression) + CSECallbackMapper(map_cse)(insn.expression) + + # }}} + + kernel = fix_grid_sizes(kernel) + + if 0: + loop_dep_graph = generate_loop_dep_graph(kernel) + for k, v in loop_dep_graph.iteritems(): + print "%s: %s" % (k, ",".join(v)) + 1/0 + + #grid_size, group_size = find_known_grid_and_group_sizes(kernel) + + #kernel = assign_grid_and_group_indices(kernel) + + for gen_knl in generate_loop_schedules_internal(kernel): + yield gen_knl + + + + + +def generate_loop_schedules_old(kernel, hints=[]): + # OLD! 
from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX, TAG_ILP, ParallelTag prev_schedule = kernel.schedule diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b928d4b31..69637268c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2,33 +2,157 @@ from __future__ import division -from pymbolic.mapper import CombineMapper, RecursiveMapper, IdentityMapper +from pymbolic.primitives import AlgebraicLeaf +from pymbolic.mapper import ( + CombineMapper as CombineMapperBase, + IdentityMapper as IdentityMapperBase, + RecursiveMapper) from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE +from pymbolic.mapper.substitutor import \ + SubstitutionMapper as SubstitutionMapperBase +from pymbolic.mapper.stringifier import \ + StringifyMapper as StringifyMapperBase import numpy as np import islpy as isl +from islpy import dim_type +# {{{ loopy-specific primitives -# {{{ subscript expression collector +class Reduction(AlgebraicLeaf): + def __init__(self, operation, iname, expr, tag=None): + self.operation = operation + self.iname = iname + self.expr = expr + self.tag = tag -class AllSubscriptExpressionCollector(CombineMapper): - def combine(self, values): - from pytools import flatten - return set(flatten(values)) + def __getinitargs__(self): + return (self.operation, self.iname, self.expr, self.tag) - def map_constant(self, expr): - return set() + def get_hash(self): + return hash((self.__class__, self.operation, self.iname, + self.expr, self.tag)) - def map_algebraic_leaf(self, expr): - return set() + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.operation == self.operation + and other.iname == self.iname + and other.expr == self.expr + and other.tag == self.tag) - def map_subscript(self, expr): + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_reduction") + +# }}} + +# {{{ mappers with support for loopy-specific primitives + +class IdentityMapperMixin(object): + def map_reduction(self, expr): + return Reduction(expr.operation, expr.iname, + self.rec(expr.expr), expr.tag) + +class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): + pass + +class CombineMapper(CombineMapperBase): + def map_reduction(self, expr): + return self.rec(expr.expr) + +class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin): + pass + +class StringifyMapper(StringifyMapperBase): + def map_reduction(self, expr, prec): + return "reduce(%s, %s, %s, tag=%s)" % ( + expr.operation, expr.iname, expr.expr, expr.tag) + +# }}} + +# {{{ functions to primitives + +class FunctionToPrimitiveMapper(IdentityMapper): + """Looks for invocations of a function called 'cse' or 'reduce' and + turns those into the actual pymbolic primitives used for that. 
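+
+    For example, ``reduce(sum, k, a[i,k]*b[k,j], mytag)`` becomes
+    ``Reduction("sum", "k", a[i,k]*b[k,j], "mytag")``.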
+ """ + + def map_call(self, expr): from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) + if isinstance(expr.function, Variable) and expr.function.name == "cse": + from pymbolic.primitives import CommonSubexpression + if len(expr.parameters) == 2: + if not isinstance(expr.parameters[1], Variable): + raise TypeError("second argument to cse() must be a symbol") + return CommonSubexpression( + expr.parameters[0], expr.parameters[1].name) + else: + raise TypeError("cse takes two arguments") + + elif isinstance(expr.function, Variable) and expr.function.name == "reduce": + if len(expr.parameters) == 3: + operation, iname, red_expr = expr.parameters + tag = None + elif len(expr.parameters) == 4: + operation, iname, red_expr, tag = expr.parameters + else: + raise TypeError("reduce takes three or four arguments") + + red_expr = self.rec(red_expr) + + if not isinstance(operation, Variable): + raise TypeError("operation argument to reduce() must be a symbol") + operation = operation.name + if not isinstance(iname, Variable): + raise TypeError("iname argument to reduce() must be a symbol") + iname = iname.name + if tag is not None: + if not isinstance(tag, Variable): + raise TypeError("tag argument to reduce() must be a symbol") + tag = tag.name + + return Reduction(operation, iname, red_expr, tag) + else: + return IdentityMapper.map_call(self, expr) - return set([expr]) +# }}} + +# {{{ reduction loop splitter + +class ReductionLoopSplitter(IdentityMapper): + def __init__(self, old_iname, outer_iname, inner_iname, do_warn=True): + self.old_iname = old_iname + self.outer_iname = outer_iname + self.inner_iname = inner_iname + self.do_warn = do_warn + + def map_reduction(self, expr): + if expr.iname == self.old_iname: + if self.do_warn: + from warnings import warn + from loopy import LoopyAdvisory + warn("The reduction loop for iname '%s' (with tag '%s') " + "was split into two nested reductions with separate " + "state variables. It might be advantageous to " + "'realize' the reduction first before splitting " + "this loop." 
+                        % (expr.iname, expr.tag), LoopyAdvisory)
+
+            if expr.tag is not None:
+                outer_tag = expr.tag + "_outer"
+                inner_tag = expr.tag + "_inner"
+            else:
+                outer_tag = None
+                inner_tag = None
+
+            return Reduction(expr.operation, self.outer_iname,
+                    Reduction(expr.operation, self.inner_iname,
+                        expr.expr, inner_tag),
+                    outer_tag)
+        else:
+            return IdentityMapper.map_reduction(self, expr)
 
 # }}}
 
@@ -93,7 +217,7 @@ class CoefficientCollector(RecursiveMapper):
 # {{{ variable index expression collector
 
 class VariableIndexExpressionCollector(CombineMapper):
-    def __init__(self, tgt_vector_name):
+    def __init__(self, tgt_vector_name=None):
         self.tgt_vector_name = tgt_vector_name
 
@@ -110,8 +234,8 @@
         from pymbolic.primitives import Variable
         assert isinstance(expr.aggregate, Variable)
 
-        if expr.aggregate.name == self.tgt_vector_name:
-            return set([expr.index])
+        if self.tgt_vector_name is None or expr.aggregate.name == self.tgt_vector_name:
+            return set([expr.index]) | self.rec(expr.index)
         else:
             return CombineMapper.map_subscript(self, expr)
 
@@ -246,62 +370,113 @@ class LoopyCCodeMapper(CCodeMapper):
 
 # }}}
 
-# {{{ expression <-> constraint conversion
+# {{{ aff -> expr conversion
+
+def aff_to_expr(aff):
+    from pymbolic import var
 
-def _constraint_from_expr(space, expr, constraint_factory):
-    from loopy.symbolic import CoefficientCollector
-    return constraint_factory(space,
-            CoefficientCollector()(expr))
+    result = int(aff.get_constant())
+    for dt in [dim_type.in_, dim_type.param]:
+        for i in xrange(aff.dim(dt)):
+            coeff = int(aff.get_coefficient(dt, i))
+            if coeff:
+                result += coeff*var(aff.get_dim_name(dt, i))
+
+    for i in xrange(aff.dim(dim_type.div)):
+        coeff = int(aff.get_coefficient(dim_type.div, i))
+        if coeff:
+            result += coeff*aff_to_expr(aff.get_div(i))
+
+    denom = aff.get_denominator()
+    if denom == 1:
+        return result
+    else:
+        return result // denom
+
+
+
+
+def pw_aff_to_expr(pw_aff):
+    pieces = pw_aff.get_pieces()
+
+    if len(pieces) != 1:
+        raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances")
+
+    (set, aff), = pieces
+    return aff_to_expr(aff)
+
+def aff_from_expr(space, expr):
+    n = space.dim(dim_type.set)
+
+    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
+    context = {}
+    for name, (dt, pos) in space.get_var_dict().iteritems():
+        if dt == dim_type.set:
+            dt = dim_type.in_
+
+        context[name] = zero.set_coefficient(dt, pos, 1)
+
+    from pymbolic import evaluate
+    return evaluate(expr, context)
+
+# }}}
+
+# {{{ expression <-> constraint conversion
 
 def eq_constraint_from_expr(space, expr):
-    return _constraint_from_expr(
-            space, expr, isl.Constraint.eq_from_names)
+    return isl.Constraint.equality_from_aff(aff_from_expr(space, expr))
 
 def ineq_constraint_from_expr(space, expr):
-    return _constraint_from_expr(
-            space, expr, isl.Constraint.ineq_from_names)
+    return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr))
 
 def constraint_to_expr(cns, except_name=None):
-    excepted_coeff = 0
-    result = 0
-    from pymbolic import var
-    for var_name, coeff in cns.get_coefficients_by_name().iteritems():
-        if isinstance(var_name, str):
-            if var_name == except_name:
-                excepted_coeff = int(coeff)
-            else:
-                result += int(coeff)*var(var_name)
-        else:
-            result += int(coeff)
+    return aff_to_expr(cns.get_aff())
 
-    if except_name is not None:
-        return result, excepted_coeff
-    else:
+
+# }}}
+
+# {{{ CSE callback mapper
+
+class CSECallbackMapper(IdentityMapper):
+    def __init__(self,
callback): + self.callback = callback + + def map_common_subexpression(self, expr): + result = self.callback(expr, self.rec) + if result is None: + return IdentityMapper.map_common_subexpression(self, expr) return result # }}} -# {{{ CSE "realizer" +# {{{ Reduction callback mapper -class CSERealizer(IdentityMapper): - """Looks for invocations of a function called 'cse' and turns those into - CommonSubexpression objects. - """ +class ReductionCallbackMapper(IdentityMapper): + def __init__(self, callback): + self.callback = callback - def map_call(self, expr): - from pymbolic import Variable - if isinstance(expr.function, Variable) and expr.function.name == "cse": - from pymbolic.primitives import CommonSubexpression - if len(expr.parameters) == 1: - return CommonSubexpression(expr.parameters[0]) - else: - raise TypeError("cse takes exactly one argument") - else: - return IdentityMapper.map_call(self, expr) + def map_reduction(self, expr): + result = self.callback(expr, self.rec) + if result is None: + return IdentityMapper.map_reduction(self, expr) + return result + +# }}} + +# {{{ index dependency finding + +class IndexVariableFinder(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, set()) + + def map_subscript(self, expr): + from pymbolic.mapper.dependency import DependencyMapper + return DependencyMapper()(expr) # }}} + # vim: foldmethod=marker diff --git a/loopy/tools.py b/loopy/tools.py new file mode 100644 index 000000000..95d4e4e41 --- /dev/null +++ b/loopy/tools.py @@ -0,0 +1,13 @@ +from __future__ import division + + + + +def generate_unique_possibilities(prefix): + yield prefix + + try_num = 0 + while True: + yield "%s_%d" % (prefix, try_num) + try_num += 1 + diff --git a/test/test_isl.py b/test/test_isl.py new file mode 100644 index 000000000..c6925355a --- /dev/null +++ b/test/test_isl.py @@ -0,0 +1,27 @@ +import islpy as isl + + + + +def test_aff_to_expr(): + s = isl.Space.create_from_names(isl.Context(), ["a", "b"]) + zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(s)) + one = zero.set_constant(1) + a = zero.set_coefficient(isl.dim_type.in_, 0, 1) + b = zero.set_coefficient(isl.dim_type.in_, 1, 1) + + x = (5*a + 3*b) % 17 % 5 + print x + from loopy.symbolic import aff_to_expr + print aff_to_expr(x) + + + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) diff --git a/test/test_matmul.py b/test/test_matmul.py index 0ea8b86f8..b58edc582 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -201,14 +201,15 @@ def test_plain_matrix_mul_new_ui(ctx_factory): n = get_suitable_size(ctx) knl = lp.LoopKernel(ctx.devices[0], - "{[i,j,k]: 0<=i,j,k<%d}" % n, + "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ - "c[i, j] = reduce(sum, k, cse(a[i, k], 'lhsmat')*cse(b[k, j], 'rhsmat'))" + "c[i, j] = reduce(sum, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), + lp.ScalarArg("n", np.int32, approximately=n), ], name="matmul") @@ -216,13 +217,24 @@ def test_plain_matrix_mul_new_ui(ctx_factory): outer_tag="g.0", inner_tag="l.1", no_slabs=True) knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0", no_slabs=True) + for insn in knl.instructions: + print insn + print + knl = lp.realize_reduction(knl, "k", dtype) + for insn in knl.instructions: + print insn + print knl = 
lp.split_dimension(knl, "k", 16, no_slabs=True) - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) - #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) - assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl)) + knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) + knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"]) + + for insn in knl.instructions: + print insn + + #assert lp.get_problems(knl, {})[0] <= 2 + + kernel_gen = lp.generate_loop_schedules(knl) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) -- GitLab