diff --git a/examples/quadrature.py b/examples/quadrature.py
index 9bf4e6f21f6b1ecd5b234dd31a0864dba372f09a..c39d6c99ab7461ffe8dfe63f08ded986f8b45baa 100644
--- a/examples/quadrature.py
+++ b/examples/quadrature.py
@@ -34,17 +34,14 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context):
     Nb = 3
     Nv = 3
     Nq = 3*3
-
     Nc = 1600
-    from pymbolic import var
-    m, w, det_j, phi, c, i, j, q = [var(s) for s in "m w det_j phi c i j q".split()]
 
     knl = lp.LoopKernel(ctx.devices[0],
             "[ncells] -> {[c,i,j,q]: 0<=c<ncells and 0 <= i < %(Nv)s "
             "and 0<=j<%(Nb)s and 0<=q<%(Nq)s}" % dict(
                 Nv=Nv, Nb=Nb, Nq=Nq),
             [
-                (m[c, i, j], w[q]*det_j[c]*phi[i,q]*phi[j,q])
+                "m[c,i,j] = w[q]*det_j[c]*phi[i,q]*phi[j,q]",
                 ],
             [
                 lp.ArrayArg("m", dtype, shape=(Nc, Nv, Nb)),
@@ -54,17 +51,16 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context):
                 lp.ScalarArg("ncells", np.int32, approximately=1000),
                 ],
             name="mass_mat",
-            iname_to_tag=dict(i="l.0", j="l.1")
+            iname_to_tag=dict(i="l.0", j="l.1"),
+            assumptions="ncells >= 1"
             )
-    knl = lp.split_dimension(knl, "c", 8, outer_tag="g.0", inner_tag="l.2")
-    knl = lp.add_prefetch(knl, "det_j", ["c_inner"])
+    knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2",
+            outer_slab_increments=(0,0))
+    knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0",
+            outer_slab_increments=(0,0))
 
     # fix reg prefetch
-    # fix redundant slab generation
-
-    # FIXME
-    #knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2")
-    #knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0")
+    knl = lp.add_prefetch(knl, "det_j", ["c_inner"])
 
     #ilp = 4
     #knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
@@ -77,7 +73,8 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context):
     #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"])
     #assert knl.get_problems({})[0] <= 2
 
-    kernel_gen = (lp.insert_register_prefetches(knl)
+    kernel_gen = (knl
+    #kernel_gen = (lp.insert_register_prefetches(knl)
             for knl in lp.generate_loop_schedules(knl))
 
     if False:
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 147efabf99ab20e57ee88c19cfc1bc06c9f96653..46bc404e5ee3230429e64db6e68aefb2d67973ac 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -21,24 +21,23 @@ register_mpz_with_pymbolic()
 # TODO: Custom reductions per red. axis
 # TODO: Functions
 # TODO: Common subexpressions
-# TODO: Parse ops from string
+# TODO: Array common subexpressions
 # FIXME: support non-reductive dimensions (what did I mean here?)
 # FIXME: write names should be assigned during scheduling
+# FIXME: screwy lower bounds in ILP
 
 # TODO: Divisibility
 # TODO: Try, fix indirect addressing
-# TODO: More user control for slab opt
 
 # TODO: Implement GT200 matmul, Fermi matmul, DG
 # TODO: DMA engine threads?
-# TODO: Specify initial implemented domain.
-#   (to filter away unnecessary conditions on parameters)
 # TODO: Deal with equalities that crop up.
+# TODO: Better user feedback.
 
 # Later:
 # ------
 # TODO: Try different kernels
-# TODO:   - Tricky: Convolution, FD
+# TODO:   - Tricky: Convolution, Stencil
 # TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
 # TODO: implement efficient ceil_div? (as opposed to floor_div)
 # TODO: why are corner cases inefficient?
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index e8f10a1a8d741c0e875a55e8c6ccce59305260f7..6b9cb966fbd2aafbe94617e16edf6626e773def3 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -2,7 +2,6 @@ from __future__ import division
 
 from pytools import Record
 import numpy as np
-from pymbolic.mapper.stringifier import PREC_NONE
 
 
 
@@ -95,50 +94,56 @@ def add_comment(cmt, code):
 
 # {{{ main code generation entrypoint
 
-class CodeGenerationState(Record):
-    __slots__ = ["c_code_mapper", "try_slab_partition"]
+class ExecutionSubdomain(Record):
+    __slots__ = ["implemented_domain", "c_code_mapper"]
+
+    def __init__(self, implemented_domain, c_code_mapper):
+        Record.__init__(self,
+                implemented_domain=implemented_domain,
+                c_code_mapper=c_code_mapper)
+
+    def intersect(self, set):
+        return ExecutionSubdomain(
+                self.implemented_domain.intersect(set),
+                self.c_code_mapper)
 
 class ExecutionDomain(object):
-    def __init__(self, implemented_domain, assignments_and_impl_domains=None):
+    def __init__(self, implemented_domain, c_code_mapper, subdomains=None):
         """
         :param implemented_domain: The entire implemented domain,
             i.e. all constraints that have been enforced so far.
-        :param assignments_and_impl_domains: a list of tuples 
-            (assignments, implemented_domain), where *assignments*
-            is a list of :class:`cgen.Initializer` instances
-            and *implemented_domain* is the implemented domain to which
-            the situation produced by the assignments corresponds.
+        :param subdomains: a list of :class:`ExecutionSubdomain`
+            instances.
 
-            The point of this being is a list is the implementation of
+            The point of this being a list is the implementation of
             ILP, and each entry represents a 'fake-parallel' trip through the 
-            ILP'd loop.
+            ILP'd loop, with the requisite implemented_domain
+            and a C code mapper that realizes the necessary assignments.
+        :param c_code_mapper: A C code mapper that does not take per-ILP
+            assignments into account.
         """
-        if assignments_and_impl_domains is None:
-            assignments_and_impl_domains = [([], implemented_domain)]
         self.implemented_domain = implemented_domain
-        self.assignments_and_impl_domains = assignments_and_impl_domains
-
-    def __len__(self):
-        return len(self.assignments_and_impl_domains)
+        if subdomains is None:
+            self.subdomains = [
+                    ExecutionSubdomain(implemented_domain, c_code_mapper)]
+        else:
+            self.subdomains = subdomains
 
-    def __iter__(self):
-        return iter(self.assignments_and_impl_domains)
+        self.c_code_mapper = c_code_mapper
 
     def intersect(self, set):
         return ExecutionDomain(
                 self.implemented_domain.intersect(set),
-                [(assignments, implemented_domain.intersect(set))
-                for assignments, implemented_domain
-                in self.assignments_and_impl_domains])
+                self.c_code_mapper,
+                [subd.intersect(set) for subd in self.subdomains])
 
     def get_the_one_domain(self):
-        assert len(self.assignments_and_impl_domains) == 1
+        assert len(self.subdomains) == 1
         return self.implemented_domain
 
 
 
 
-
 def generate_code(kernel):
     from loopy.codegen.prefetch import preprocess_prefetch
     kernel = preprocess_prefetch(kernel)
@@ -151,10 +156,7 @@ def generate_code(kernel):
             CLLocal, CLImage, CLConstant)
 
     from loopy.symbolic import LoopyCCodeMapper
-    my_ccm = LoopyCCodeMapper(kernel)
-
-    def ccm(expr, prec=PREC_NONE):
-        return my_ccm(expr, prec)
+    ccm = LoopyCCodeMapper(kernel)
 
     # {{{ build top-level
 
@@ -266,13 +268,10 @@ def generate_code(kernel):
 
     # }}}
 
-    cgs = CodeGenerationState(c_code_mapper=ccm, try_slab_partition=True)
-
     from loopy.codegen.dispatch import build_loop_nest
-    import islpy as isl
 
-    gen_code = build_loop_nest(cgs, kernel, 0, 
-            ExecutionDomain(isl.Set.universe(kernel.space)))
+    gen_code = build_loop_nest(kernel, 0,
+            ExecutionDomain(kernel.assumptions, c_code_mapper=ccm))
     body.extend([Line(), gen_code.ast])
     #print "# conditionals: %d" % gen_code.num_conditionals
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index e602e951e2994a63b4de060d6c14e96ef17f9814..4c509d4c729b6ab44fdbc980e038a7f2745a835b 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -2,7 +2,7 @@ from __future__ import division
 
 import islpy as isl
 from islpy import dim_type
-from pymbolic.mapper.stringifier import PREC_NONE
+import numpy as np
 
 
 
@@ -192,45 +192,61 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt):
     cfm = CommutativeConstantFoldingMapper()
 
     from loopy.symbolic import constraint_to_expr
-    from pytools import any
 
-    if any(cns.is_equality() for cns in constraints):
-        raise NotImplementedError("equality constraints for 'for' loops")
-    else:
-        start_exprs = []
-        end_conds = []
-
-        for cns in constraints:
-            rhs, iname_coeff = constraint_to_expr(cns, except_name=iname)
-
-            if iname_coeff == 0:
-                continue
-            elif iname_coeff < 0:
-                from pymbolic import var
-                rhs += iname_coeff*var(iname)
-                end_conds.append("%s >= 0" %
-                        ccm(cfm(expand(rhs))))
-            else: #  iname_coeff > 0
-                kind, bound = solve_constraint_for_bound(cns, iname)
-                assert kind == ">="
-                start_exprs.append(bound)
-
-    if len(start_exprs) > 1:
-        from pymbolic.primitives import Max
-        start_expr = Max(start_exprs)
-    elif len(start_exprs) == 1:
-        start_expr, = start_exprs
-    else:
-        raise RuntimeError("no starting value found for 'for' loop in '%s'"
-                % iname)
+    start_exprs = []
+    end_conds = []
+    equality_exprs = []
 
-    from cgen import For
-    from loopy.codegen import wrap_in
-    return wrap_in(For,
-            "int %s = %s" % (iname, ccm(start_expr, PREC_NONE)),
-            " && ".join(end_conds),
-            "++%s" % iname,
-            stmt)
+    for cns in constraints:
+        rhs, iname_coeff = constraint_to_expr(cns, except_name=iname)
+
+        if iname_coeff == 0:
+            continue
+
+        if cns.is_equality():
+            kind, bound = solve_constraint_for_bound(cns, iname)
+            assert kind == "=="
+            equality_exprs.append(bound)
+        elif iname_coeff < 0:
+            from pymbolic import var
+            rhs += iname_coeff*var(iname)
+            end_conds.append("%s >= 0" %
+                    ccm(cfm(expand(rhs))))
+        else: #  iname_coeff > 0
+            kind, bound = solve_constraint_for_bound(cns, iname)
+            assert kind == ">="
+            start_exprs.append(bound)
+
+    if equality_exprs:
+        assert len(equality_exprs) == 1
+
+        equality_expr, = equality_exprs
+
+        from loopy.codegen import gen_code_block
+        from cgen import Initializer, POD, Const, Line
+        return gen_code_block([
+            Initializer(Const(POD(np.int32, iname)),
+                ccm(equality_expr)),
+            Line(),
+            stmt,
+            ])
+    else:
+        if len(start_exprs) > 1:
+            from pymbolic.primitives import Max
+            start_expr = Max(start_exprs)
+        elif len(start_exprs) == 1:
+            start_expr, = start_exprs
+        else:
+            raise RuntimeError("no starting value found for 'for' loop in '%s'"
+                    % iname)
+
+        from cgen import For
+        from loopy.codegen import wrap_in
+        return wrap_in(For,
+                "int %s = %s" % (iname, ccm(start_expr)),
+                " && ".join(end_conds),
+                "++%s" % iname,
+                stmt)
 
 # }}}
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
index f13b501a5171a63bcec91cea9dbb50d2b28b596a..e1f688a051fd3c1456813f05965e6c84fe34191f 100644
--- a/loopy/codegen/dispatch.py
+++ b/loopy/codegen/dispatch.py
@@ -6,10 +6,10 @@ from loopy.codegen import ExecutionDomain, gen_code_block
 
 
 
-def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=False):
+def build_loop_nest(kernel, sched_index, exec_domain, no_conditional_check=False):
     assert isinstance(exec_domain, ExecutionDomain)
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
 
     from cgen import (POD, Initializer, Assign, Statement as S,
             Line)
@@ -37,7 +37,7 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=
 
             exec_domain = exec_domain.intersect(exec_domain_restriction)
 
-            inner = build_loop_nest(cgs, kernel, sched_index, exec_domain,
+            inner = build_loop_nest(kernel, sched_index, exec_domain,
                     no_conditional_check=True)
 
             from loopy.codegen import wrap_in_if
@@ -57,30 +57,21 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=
 
         valid_index_vars = get_valid_check_vars(kernel, sched_index, allow_ilp=True)
         bounds_check_lists = [
-                generate_bounds_checks_code(ccm, kernel.domain,
-                    valid_index_vars, impl_domain)
-                for assignments, impl_domain in
-                    exec_domain]
+                generate_bounds_checks_code(subd.c_code_mapper, kernel.domain,
+                    valid_index_vars, subd.implemented_domain)
+                for subd in exec_domain.subdomains]
 
         result = []
         for lvalue, expr in kernel.instructions:
-            for i, (assignments, impl_domain) in \
-                    enumerate(exec_domain):
-
-                my_block = []
-                if assignments:
-                    my_block.extend(assignments+[Line()])
-
+            for i, subd in enumerate(exec_domain.subdomains):
                 assert isinstance(lvalue, Subscript)
                 name = lvalue.aggregate.name
 
                 from loopy.codegen import wrap_in_if
-                my_block.append(
-                        wrap_in_if(
+                result.append(wrap_in_if(
                             bounds_check_lists[i],
                             S("tmp_%s_%d += %s"
-                                % (name, i, ccm(expr)))))
-                result.append(gen_code_block(my_block))
+                                % (name, i, subd.c_code_mapper(expr)))))
 
         return gen_code_block(result)
 
@@ -109,42 +100,36 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=
         else:
             func = generate_sequential_loop_dim_code
 
-        return func(cgs, kernel, sched_index, exec_domain)
+        return func(kernel, sched_index, exec_domain)
 
     elif isinstance(sched_item, WriteOutput):
         result = (
                 [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype,
                     "tmp_%s_%d" % (lvalue.aggregate.name, i)), 0)
-                    for i in range(len(exec_domain))
+                    for i in range(len(exec_domain.subdomains))
                     for lvalue, expr in kernel.instructions]
                 +[Line()]
-                +[build_loop_nest(cgs, kernel, sched_index+1, 
+                +[build_loop_nest(kernel, sched_index+1,
                     exec_domain)]
                 +[Line()])
 
-        for i, (idx_assignments, impl_domain) in \
-                enumerate(exec_domain):
+        for i, subd in enumerate(exec_domain.subdomains):
             for lvalue, expr in kernel.instructions:
-                assignment = Assign(ccm(lvalue), "tmp_%s_%d" % (
+                assignment = Assign(subd.c_code_mapper(lvalue), "tmp_%s_%d" % (
                     lvalue.aggregate.name, i))
 
                 wrapped_assign = wrap_in_bounds_checks(
-                        ccm, kernel.domain,
+                        subd.c_code_mapper, kernel.domain,
                         get_valid_check_vars(kernel, sched_index, allow_ilp=True),
-                        impl_domain, assignment)
-
-                cb = []
-                if idx_assignments:
-                    cb.extend(idx_assignments+[Line()])
-                cb.append(wrapped_assign)
+                        subd.implemented_domain, assignment)
 
-                result.append(gen_code_block(cb))
+                result.append(wrapped_assign)
 
         return gen_code_block(result)
 
     elif isinstance(sched_item, LocalMemoryPrefetch):
         from loopy.codegen.prefetch import generate_prefetch_code
-        return generate_prefetch_code(cgs, kernel, sched_index, 
+        return generate_prefetch_code(kernel, sched_index,
                 exec_domain)
 
     elif isinstance(sched_item, RegisterPrefetch):
@@ -159,7 +144,7 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=
                     % (agg_name,
                         ccm(sched_item.subscript_expr.index)))),
 
-            build_loop_nest(cgs, kernel, sched_index+1, exec_domain)
+            build_loop_nest(kernel, sched_index+1, exec_domain)
             ])
 
     else:
diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py
index 7c799a84c3b5026ac12a4f5d6e4be5353aa1cf65..888682e5aa76169879c9bc0e1ea1d27b089eac27 100644
--- a/loopy/codegen/loop_dim.py
+++ b/loopy/codegen/loop_dim.py
@@ -32,86 +32,46 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
 
 # {{{ conditional-minimizing slab decomposition
 
-def get_slab_decomposition(cgs, kernel, sched_index, exec_domain):
+def get_slab_decomposition(kernel, sched_index, exec_domain):
     from loopy.isl import block_shift_constraint, negate_constraint
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
-    # {{{ attempt slab partition to reduce conditional count
-
     lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname,
             exec_domain.implemented_domain)
 
-    # jostle the constant in {lb,ub}_cns to see if we can get
-    # fewer conditionals in the bulk middle segment
-
-    class TrialRecord(Record):
-        pass
-
-    if (cgs.try_slab_partition
-            and "outer" in iname):
-        trial_cgs = cgs.copy(try_slab_partition=False)
-        trials = []
-
-        for lower_incr, upper_incr in [ (0,0), (0,-1), ]:
-
-            lb_cns = block_shift_constraint(lb_cns_orig, iname, -lower_incr)
-            ub_cns = block_shift_constraint(ub_cns_orig, iname, -upper_incr)
-
-            bulk_slab = (isl.Set.universe(kernel.space)
-                    .add_constraint(lb_cns)
-                    .add_constraint(ub_cns))
-            bulk_exec_domain = exec_domain.intersect(bulk_slab)
-            inner = build_loop_nest(trial_cgs, kernel, sched_index+1,
-                    bulk_exec_domain)
-
-            trials.append((TrialRecord(
-                lower_incr=lower_incr,
-                upper_incr=upper_incr,
-                bulk_slab=bulk_slab),
-                (inner.num_conditionals,
-                    # when all num_conditionals are equal, choose the
-                    # one with the smallest bounds changes
-                    abs(upper_incr)+abs(lower_incr))))
-
-        from pytools import argmin2
-        chosen = argmin2(trials)
-    else:
-        bulk_slab = (isl.Set.universe(kernel.space)
-                .add_constraint(lb_cns_orig)
-                .add_constraint(ub_cns_orig))
-        chosen = TrialRecord(
-                    lower_incr=0,
-                    upper_incr=0,
-                    bulk_slab=bulk_slab)
-
-    # }}}
+    lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0))
 
     # {{{ build slabs
 
     slabs = []
-    if chosen.lower_incr:
+    if lower_incr:
         slabs.append(("initial", isl.Set.universe(kernel.space)
                 .add_constraint(lb_cns_orig)
                 .add_constraint(ub_cns_orig)
                 .add_constraint(
                     negate_constraint(
                         block_shift_constraint(
-                            lb_cns_orig, iname, -chosen.lower_incr)))))
+                            lb_cns_orig, iname, -lower_incr)))))
 
-    slabs.append(("bulk", chosen.bulk_slab))
+    slabs.append(("bulk",
+        (isl.Set.universe(kernel.space)
+            .add_constraint(
+                block_shift_constraint(lb_cns_orig, iname, -lower_incr))
+            .add_constraint(
+                block_shift_constraint(ub_cns_orig, iname, -upper_incr)))))
 
-    if chosen.upper_incr:
+    if upper_incr:
         slabs.append(("final", isl.Set.universe(kernel.space)
                 .add_constraint(ub_cns_orig)
                 .add_constraint(lb_cns_orig)
                 .add_constraint(
                     negate_constraint(
                         block_shift_constraint(
-                            ub_cns_orig, iname, -chosen.upper_incr)))))
+                            ub_cns_orig, iname, -upper_incr)))))
 
     # }}}
 
@@ -121,13 +81,13 @@ def get_slab_decomposition(cgs, kernel, sched_index, exec_domain):
 
 # {{{ unrolled/ILP loops
 
-def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain):
+def generate_unroll_or_ilp_code(kernel, sched_index, exec_domain):
     from loopy.isl import block_shift_constraint
     from loopy.codegen.bounds import solve_constraint_for_bound
 
     from cgen import (POD, Assign, Line, Statement as S, Initializer, Const)
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
@@ -162,8 +122,7 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain):
 
         for i, slab in generate_idx_eq_slabs():
             new_exec_domain = exec_domain.intersect(slab)
-            inner = build_loop_nest(cgs, kernel, sched_index+1,
-                    new_exec_domain)
+            inner = build_loop_nest(kernel, sched_index+1, new_exec_domain)
 
             if isinstance(tag, TAG_UNROLL_STATIC):
                 result.extend([
@@ -175,24 +134,25 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain):
         return gen_code_block(result)
 
     elif isinstance(tag, TAG_ILP):
-        new_aaid = []
-        for assignments, implemented_domain in exec_domain:
+        new_subdomains = []
+        for subd in exec_domain.subdomains:
             for i, single_slab in generate_idx_eq_slabs():
-                assignments = assignments + [
-                        Initializer(Const(POD(np.int32, iname)), ccm(lower_bound+i))]
-                new_aaid.append((assignments, 
-                    implemented_domain.intersect(single_slab)))
-
-                assignments = []
+                from loopy.codegen import ExecutionSubdomain
+                new_subdomains.append(
+                        ExecutionSubdomain(
+                            subd.implemented_domain.intersect(single_slab),
+                            subd.c_code_mapper.copy_and_assign(
+                                iname, lower_bound+i)))
 
         overall_slab = (isl.Set.universe(kernel.space)
                 .add_constraint(lower_cns)
                 .add_constraint(upper_cns))
 
-        return build_loop_nest(cgs, kernel, sched_index+1,
+        return build_loop_nest(kernel, sched_index+1,
                 ExecutionDomain(
                     exec_domain.implemented_domain.intersect(overall_slab),
-                    new_aaid))
+                    exec_domain.c_code_mapper,
+                    new_subdomains))
     else:
         assert False, "not supposed to get here"
 
@@ -200,16 +160,16 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain):
 
 # {{{ parallel loop
 
-def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain):
+def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain):
     from loopy.isl import make_slab
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
     lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition(
-            cgs, kernel, sched_index, exec_domain)
+            kernel, sched_index, exec_domain)
 
     # For a parallel loop dimension, the global loop bounds are
     # automatically obeyed--simply because no work items are launched
@@ -242,8 +202,7 @@ def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain):
                 domain=kernel.domain.intersect(slab))
         result.append(
                 add_comment(cmt,
-                    build_loop_nest(cgs, new_kernel, sched_index+1,
-                        exec_domain)))
+                    build_loop_nest(new_kernel, sched_index+1, exec_domain)))
 
     from loopy.codegen import gen_code_block
     return gen_code_block(result, is_alternatives=True)
@@ -252,15 +211,15 @@ def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain):
 
 # {{{ sequential loop
 
-def generate_sequential_loop_dim_code(cgs, kernel, sched_index, exec_domain):
+def generate_sequential_loop_dim_code(kernel, sched_index, exec_domain):
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
     lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition(
-            cgs, kernel, sched_index, exec_domain)
+            kernel, sched_index, exec_domain)
 
     result = []
     nums_of_conditionals = []
@@ -271,7 +230,7 @@ def generate_sequential_loop_dim_code(cgs, kernel, sched_index, exec_domain):
             cmt = None
 
         new_exec_domain = exec_domain.intersect(slab)
-        inner = build_loop_nest(cgs, kernel, sched_index+1,
+        inner = build_loop_nest(kernel, sched_index+1,
                 new_exec_domain)
 
         from loopy.codegen.bounds import wrap_in_for_from_constraints
diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py
index 22af66d33b52e22d4d877440a60eb3af2619da59..155927e4b3be4ae1ae6913fd574773e0209c7374 100644
--- a/loopy/codegen/prefetch.py
+++ b/loopy/codegen/prefetch.py
@@ -105,24 +105,21 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map,
         # done, return
         from pymbolic.primitives import Variable, Subscript
 
-        from pymbolic.mapper.stringifier import PREC_NONE
         result = Assign(
                 pf.name + "".join("[%s]" % ccm(dexpr)
                     for dexpr in pf_dim_exprs),
                 no_pf_ccm(
                     Subscript(
                         Variable(pf.input_vector),
-                        substitute(pf.index_expr, iname_subst_map)),
-                    PREC_NONE))
-
-        def my_ccm(expr):
-            return ccm(substitute(expr, iname_subst_map))
+                        substitute(pf.index_expr, iname_subst_map))))
 
         from pymbolic.mapper.dependency import DependencyMapper
         check_vars = [v.name for v in DependencyMapper()(pf.index_expr)]
 
         from loopy.codegen.bounds import wrap_in_bounds_checks
-        return wrap_in_bounds_checks(my_ccm, pf.kernel.domain,
+        return wrap_in_bounds_checks(
+                ccm.copy_and_assign_many(iname_subst_map),
+                pf.kernel.domain,
                 check_vars, implemented_domain, result)
 
     fetch_inames = pf.fetch_dims[fetch_dim_idx]
@@ -280,12 +277,12 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map,
         # }}}
 
 
-def generate_prefetch_code(cgs, kernel, sched_index, exec_domain):
+def generate_prefetch_code(kernel, sched_index, exec_domain):
     implemented_domain = exec_domain.implemented_domain
 
     from cgen import Statement as S, Line, Comment
 
-    ccm = cgs.c_code_mapper
+    ccm = exec_domain.c_code_mapper
 
     scheduled_pf = kernel.schedule[sched_index]
     pf = kernel.prefetch[
@@ -482,7 +479,7 @@ def generate_prefetch_code(cgs, kernel, sched_index, exec_domain):
 
     from loopy.codegen.dispatch import build_loop_nest
     new_block.extend([Line(),
-        build_loop_nest(cgs, kernel, sched_index+1, exec_domain)])
+        build_loop_nest(kernel, sched_index+1, exec_domain)])
 
     return gen_code_block(new_block)
 
diff --git a/loopy/compiled.py b/loopy/compiled.py
index bd72ecdaf2894bf61752181aa7f22dfc80e16077..a2b4210d6105f4c458f19766f8677b887619edb8 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -44,7 +44,7 @@ class CompiledKernel:
 
         from pymbolic import compile
         if size_args is None:
-            self.size_args = [arg.name for arg in kernel.args if isinstance(arg, ScalarArg)]
+            self.size_args = kernel.scalar_loop_args
         else:
             self.size_args = size_args
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index f1c8638382d0b7eab5ada3c3781f00f786b81197..30aa1c515addeb4b01cf075a0a19f2efc67b76a3 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -6,6 +6,8 @@ import numpy as np
 from pytools import Record, memoize_method
 from pymbolic.mapper.dependency import DependencyMapper
 import pyopencl as cl
+import islpy as isl
+from islpy import dim_type
 
 
 
@@ -32,9 +34,17 @@ class ArrayArg:
             raise ValueError("can only specify one of shape and strides")
 
         if strides is not None:
+            if isinstance(strides, str):
+                from pymbolic import parse
+                strides = parse(strides)
+
             strides = tuple(strides)
 
         if shape is not None:
+            if isinstance(shape, str):
+                from pymbolic import parse
+                shape = parse(shape)
+
             from pyopencl.compyte.array import (
                     f_contiguous_strides,
                     c_contiguous_strides)
@@ -67,7 +77,7 @@ class ImageArg:
 
 
 class ScalarArg:
-    def __init__(self, name, dtype, approximately):
+    def __init__(self, name, dtype, approximately=None):
         self.name = name
         self.dtype = np.dtype(dtype)
         self.approximately = approximately
@@ -176,34 +186,57 @@ class LoopKernel(Record):
     :ivar register_prefetch:
     :ivar name:
     :ivar preamble:
+    :ivar assumptions: the initial implemented_domain, captures assumptions
+        on the parameters. (an isl.Set)
+    :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr,
+        upper_incr) tuples that will be separated out in the execution to generate
+        'bulk' slabs with fewer conditionals.
     """
 
     def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None,
             register_prefetch=None, name="loopy_kernel",
             iname_to_tag={}, is_divisible=lambda dividend, divisor: False,
-            preamble=None):
+            preamble=None, assumptions=None,
+            iname_slab_increments={}):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
         :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag`
         """
-        from pymbolic import parse
 
-        def parse_if_necessary(v):
-            if isinstance(v, str):
-                return parse(v)
-            else:
-                return v
+        def parse_if_necessary(insn):
+            from pymbolic import parse
+            if isinstance(insn, str):
+                lhs, rhs = insn.split("=", 1)
+            elif isinstance(insn, tuple):
+                lhs, rhs = insn
+
+            if isinstance(lhs, str):
+                lhs = parse(lhs)
+            if isinstance(rhs, str):
+                from loopy.symbolic import CSERealizer
+                rhs = CSERealizer()(parse(rhs))
+
+            return lhs, rhs
 
         if isinstance(domain, str):
-            import islpy as isl
             ctx = isl.Context()
             domain = isl.Set.read_from_str(ctx, domain, nparam=-1)
 
-        insns = [
-                (parse_if_necessary(lvalue),
-                    parse_if_necessary(expr))
-                for lvalue, expr in instructions]
+        insns = [parse_if_necessary(insn) for insn in instructions]
+
+        if assumptions is None:
+            assumptions = isl.Set.universe(domain.get_dim())
+        elif isinstance(assumptions, str):
+            s = domain.get_dim()
+            assumptions = isl.BasicSet.read_from_str(domain.get_ctx(),
+                    "[%s] -> {[%s]: %s}"
+                    % (",".join(s.get_name(dim_type.param, i)
+                        for i in range(s.size(dim_type.param))),
+                       ",".join(s.get_name(dim_type.set, i)
+                           for i in range(s.size(dim_type.set))),
+                       assumptions),
+                       nparam=-1)
 
         Record.__init__(self,
                 device=device, args=args, domain=domain, instructions=insns,
@@ -213,7 +246,9 @@ class LoopKernel(Record):
                     (iname, parse_tag(tag))
                     for iname, tag in iname_to_tag.iteritems()),
                 is_divisible=is_divisible,
-                preamble=preamble)
+                preamble=preamble,
+                assumptions=assumptions,
+                iname_slab_increments=iname_slab_increments)
 
         # FIXME: assert empty intersection of loop vars and args
         # FIXME: assert that isl knows about loop parameters
@@ -236,12 +271,16 @@ class LoopKernel(Record):
     def arg_dict(self):
         return dict((arg.name, arg) for arg in self.args)
 
+    @property
     @memoize_method
-    def scalar_args(self):
+    def scalar_loop_args(self):
         if self.args is None:
-            return set()
+            return []
         else:
-            return set(arg.name for arg in self.args if isinstance(arg, ScalarArg))
+            loop_arg_names = [self.space.get_name(dim_type.param, i)
+                    for i in range(self.space.size(dim_type.param))]
+            return [arg.name for arg in self.args if isinstance(arg, ScalarArg)
+                    if arg.name in loop_arg_names]
 
     @memoize_method
     def all_inames(self):
@@ -332,7 +371,7 @@ class LoopKernel(Record):
             input_vectors.update(
                     set(v.name for v in dm(expr)))
 
-        return input_vectors - self.all_inames() - self.scalar_args()
+        return input_vectors - self.all_inames() - set(self.scalar_loop_args)
 
     @memoize_method
     def output_vectors(self):
@@ -343,7 +382,7 @@ class LoopKernel(Record):
             output_vectors.update(
                     set(v.name for v in dm(lvalue)))
 
-        return output_vectors - self.all_inames() - self.scalar_args()
+        return output_vectors - self.all_inames() - set(self.scalar_loop_args)
 
     def _subst_insns(self, old_var, new_expr):
         from pymbolic.mapper.substitutor import substitute
@@ -393,7 +432,8 @@ class LoopKernel(Record):
 
     def split_dimension(self, name, inner_length, padded_length=None,
             outer_name=None, inner_name=None,
-            outer_tag=None, inner_tag=None):
+            outer_tag=None, inner_tag=None,
+            outer_slab_increments=(0, -1)):
 
         outer_tag = parse_tag(outer_tag)
         inner_tag = parse_tag(inner_tag)
@@ -434,33 +474,41 @@ class LoopKernel(Record):
         from islpy import dim_type
         outer_var_nr = self.space.size(dim_type.set)
         inner_var_nr = self.space.size(dim_type.set)+1
-        new_domain = self.domain.add_dims(dim_type.set, 2)
-        new_domain.set_dim_name(dim_type.set, outer_var_nr, outer_name)
-        new_domain.set_dim_name(dim_type.set, inner_var_nr, inner_name)
 
-        import islpy as isl
-        from loopy.isl import make_slab
+        def process_set(s):
+            s = s.add_dims(dim_type.set, 2)
+            s.set_dim_name(dim_type.set, outer_var_nr, outer_name)
+            s.set_dim_name(dim_type.set, inner_var_nr, inner_name)
+
+            from loopy.isl import make_slab
+
+            space = s.get_dim()
+            inner_constraint_set = (
+                    make_slab(space, inner_name, 0, inner_length)
+                    # name = inner + length*outer
+                    .add_constraint(isl.Constraint.eq_from_names(
+                        space, {name:1, inner_name: -1, outer_name:-inner_length})))
 
-        space = new_domain.get_dim()
-        inner_constraint_set = (
-                make_slab(space, inner_name, 0, inner_length)
-                # name = inner + length*outer
-                .add_constraint(isl.Constraint.eq_from_names(
-                    space, {name:1, inner_name: -1, outer_name:-inner_length})))
+            name_dim_type, name_idx = space.get_var_dict()[name]
+            return (s
+                    .intersect(inner_constraint_set)
+                    .project_out(name_dim_type, name_idx, 1))
 
-        name_dim_type, name_idx = space.get_var_dict()[name]
-        new_domain = (new_domain
-                .intersect(inner_constraint_set)
-                .project_out(name_dim_type, name_idx, 1))
+        new_domain = process_set(self.domain)
+        new_assumptions = process_set(self.assumptions)
 
         from pymbolic import var
         inner = var(inner_name)
         outer = var(outer_name)
         new_loop_index = inner + outer*inner_length
 
+        iname_slab_increments = self.iname_slab_increments.copy()
+        iname_slab_increments[outer_name] = outer_slab_increments
         return (self
                 .substitute(name, new_loop_index)
-                .copy(domain=new_domain, iname_to_tag=new_iname_to_tag))
+                .copy(domain=new_domain, iname_to_tag=new_iname_to_tag,
+                    assumptions=new_assumptions,
+                    iname_slab_increments=iname_slab_increments))
 
     def get_problems(self, parameters, emit_warnings=True):
         """
diff --git a/loopy/prefetch.py b/loopy/prefetch.py
index 49330236cad088a428b5e2837512daf4d98481b4..98aeed94806fdf73b18dd1e20d5ab81c83bbca8e 100644
--- a/loopy/prefetch.py
+++ b/loopy/prefetch.py
@@ -9,7 +9,7 @@ from islpy import dim_type
 # {{{ register prefetches
 
 class RegisterPrefetch(Record):
-    __slots__ = ["subscript_expr", "new_name"]
+    __slots__ = ["subexprs", "new_names"]
 
 def insert_register_prefetches(kernel):
     reg_pf = {}
@@ -54,7 +54,7 @@ def insert_register_prefetches(kernel):
                     reg_pf[iexpr] = new_name
                     schedule.insert(sched_index,
                             RegisterPrefetch(
-                                index_expr=iexpr, new_name=new_name))
+                                subexprs=[iexpr], new_names=[new_name]))
                     sched_index += 1
                 else:
                     i += 1
@@ -178,7 +178,7 @@ class LocalMemoryPrefetch(Record):
         from pymbolic.mapper.dependency import DependencyMapper
         return set(var.name
                 for var in DependencyMapper()(self.index_expr)
-                ) - set(self.all_inames()) - self.kernel.scalar_args()
+                ) - set(self.all_inames()) - set(self.kernel.scalar_loop_args)
 
     def hash(self):
         return (hash(type(self)) ^ hash(self.input_vector)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 2e2587f1dc184b508e049079b820ee337ca4b37a..b928d4b31bb280d68bf7d822b7e1bde2cd1b352d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -2,7 +2,7 @@
 
 from __future__ import division
 
-from pymbolic.mapper import CombineMapper, RecursiveMapper
+from pymbolic.mapper import CombineMapper, RecursiveMapper, IdentityMapper
 from pymbolic.mapper.c_code import CCodeMapper
 from pymbolic.mapper.stringifier import PREC_NONE
 import numpy as np
@@ -120,7 +120,8 @@ class VariableIndexExpressionCollector(CombineMapper):
 # {{{ C code mapper
 
 class LoopyCCodeMapper(CCodeMapper):
-    def __init__(self, kernel, no_prefetch=False):
+    def __init__(self, kernel, no_prefetch=False, cse_name_list=[],
+            var_subst_map={}):
         def constant_mapper(c):
             if isinstance(c, float):
                 # FIXME: type-variable
@@ -128,11 +129,39 @@ class LoopyCCodeMapper(CCodeMapper):
             else:
                 return repr(c)
 
-        CCodeMapper.__init__(self, constant_mapper=constant_mapper)
+        CCodeMapper.__init__(self, constant_mapper=constant_mapper,
+                cse_name_list=cse_name_list)
         self.kernel = kernel
 
+        self.var_subst_map = var_subst_map.copy()
+
         self.no_prefetch = no_prefetch
 
+    def copy(self, var_subst_map=None, cse_name_list=None):
+        if var_subst_map is None:
+            var_subst_map = self.var_subst_map
+        if cse_name_list is None:
+            cse_name_list = self.cse_name_list
+        return LoopyCCodeMapper(self.kernel, no_prefetch=self.no_prefetch,
+                cse_name_list=cse_name_list, var_subst_map=var_subst_map)
+
+    def copy_and_assign(self, name, value):
+        var_subst_map = self.var_subst_map.copy()
+        var_subst_map[name] = value
+        return self.copy(var_subst_map=var_subst_map)
+
+    def copy_and_assign_many(self, assignments):
+        var_subst_map = self.var_subst_map.copy()
+        var_subst_map.update(assignments)
+        return self.copy(var_subst_map=var_subst_map)
+
+    def map_variable(self, expr, prec):
+        if expr.name in self.var_subst_map:
+            return " /* %s */ %s" % (
+                    expr.name, self.rec(self.var_subst_map[expr.name], prec))
+        else:
+            return CCodeMapper.map_variable(self, expr, prec)
+
     def map_subscript(self, expr, enclosing_prec):
         from pymbolic.primitives import Variable
         if (not self.no_prefetch
@@ -143,11 +172,11 @@ class LoopyCCodeMapper(CCodeMapper):
             except KeyError:
                 pass
             else:
-                from pymbolic.mapper.stringifier import PREC_SUM
+                from pymbolic import var
                 return pf.name+"".join(
-                        "[%s - %s]" % (iname, self.rec(
-                            pf.dim_bounds_by_iname[iname][0],
-                            PREC_SUM))
+                        "[%s]" % self.rec(
+                            var(iname) - pf.dim_bounds_by_iname[iname][0],
+                            PREC_NONE)
                         for iname in pf.all_inames())
 
         if isinstance(expr.aggregate, Variable):
@@ -252,6 +281,26 @@ def constraint_to_expr(cns, except_name=None):
 
 # }}}
 
+# {{{ CSE "realizer"
+
+class CSERealizer(IdentityMapper):
+    """Looks for invocations of a function called 'cse' and turns those into
+    CommonSubexpression objects.
+    """
+
+    def map_call(self, expr):
+        from pymbolic import Variable
+        if isinstance(expr.function, Variable) and expr.function.name == "cse":
+            from pymbolic.primitives import CommonSubexpression
+            if len(expr.parameters) == 1:
+                return CommonSubexpression(expr.parameters[0])
+            else:
+                raise TypeError("cse takes exactly one argument")
+        else:
+            return IdentityMapper.map_call(self, expr)
+
+# }}}
+
 
 
 
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 7aa72e646ac2aa800e390ee32245e103b25d24e4..c512640c42c83ebaac34166154afab461c30912f 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -2,6 +2,7 @@ import numpy as np
 import numpy.linalg as la
 import pyopencl as cl
 import pyopencl.array as cl_array
+import pyopencl.clrandom as cl_random
 import loopy as lp
 
 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -47,7 +48,12 @@ def check_error(refsol, sol):
     if not DO_CHECK:
         return
 
-    rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro")
+    if len(sol.shape) == 2:
+        norm_order = "fro"
+    else:
+        norm_order = 2
+
+    rel_err = la.norm(refsol-sol, norm_order)/la.norm(refsol, norm_order)
     if rel_err > 1e-5 or np.isinf(rel_err) or np.isnan(rel_err):
         if 1:
             import matplotlib.pyplot as pt
@@ -86,22 +92,20 @@ def test_axpy(ctx_factory):
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = get_suitable_size(ctx)
-    from pymbolic import var
-    x, y, z, n_sym, i = [var(s) for s in "xyzni"]
-
-    n_approx = 10**6
+    n = get_suitable_size(ctx)**3
 
     knl = lp.LoopKernel(ctx.devices[0],
             "[n] -> {[i]: 0<=i<n}",
             [
-                (z[i], x[i]+y[i]) # FIXME: Add scalars
+                "z[i] = a*x[i]+b*y[i]"
                 ],
             [
-                lp.ArrayArg("x", dtype, shape=(n,)),
-                lp.ArrayArg("y", dtype, shape=(n,)),
-                lp.ArrayArg("z", dtype, shape=(n,)),
-                lp.ScalarArg("n", np.int32, approximately=n_approx),
+                lp.ScalarArg("a", dtype),
+                lp.ArrayArg("x", dtype, shape="n,"),
+                lp.ScalarArg("b", dtype),
+                lp.ArrayArg("y", dtype, shape="n,"),
+                lp.ArrayArg("z", dtype, shape="n,"),
+                lp.ScalarArg("n", np.int32, approximately=n),
                 ],
             name="matmul")
 
@@ -109,27 +113,26 @@ def test_axpy(ctx_factory):
     block_size = 256
     knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0")
     knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")
-    assert knl.get_problems({"n": n_approx})[0] <= 2
+    assert knl.get_problems({"n": n})[0] <= 2
 
     kernel_gen = (lp.insert_register_prefetches(knl)
             for knl in lp.generate_loop_schedules(knl))
 
-    #a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    #b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    #c = cl_array.empty_like(a)
-    #refsol = np.dot(a.get(), b.get())
+    a = cl_random.rand(queue, n, dtype=dtype)
+    b = cl_random.rand(queue, n, dtype=dtype)
+    c = cl_array.empty_like(a)
+    refsol = (2*a+3*b).get()
 
     def launcher(kernel, gsize, lsize, check):
-        #evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data,
-                #g_times_l=True)
+        evt = kernel(queue, gsize(n), lsize(n), 2, a.data, 3, b.data, c.data, n,
+                g_times_l=True)
 
-        #if check:
-            #check_error(refsol, c.get())
+        if check:
+            check_error(refsol, c.get())
 
-        #return evt
-        pass
+        return evt
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 5*n)
 
 
 
@@ -143,13 +146,11 @@ def test_plain_matrix_mul(ctx_factory):
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = get_suitable_size(ctx)
-    from pymbolic import var
-    a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
     knl = lp.LoopKernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                (c[i, j], a[i, k]*b[k, j])
+                "c[i, j] = a[i, k]*b[k, j]"
                 ],
             [
                 lp.ArrayArg("a", dtype, shape=(n, n), order=order),
@@ -196,13 +197,11 @@ def test_image_matrix_mul(ctx_factory):
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = get_suitable_size(ctx)
-    from pymbolic import var
-    a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
     knl = lp.LoopKernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                (c[i, j], a[i, k]*b[k, j])
+                "c[i, j] = a[i, k]*b[k, j]"
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
@@ -251,13 +250,11 @@ def test_image_matrix_mul_ilp(ctx_factory):
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = get_suitable_size(ctx)
-    from pymbolic import var
-    a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
     knl = lp.LoopKernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                (c[i, j], a[i, k]*b[k, j])
+                "c[i, j] = a[i, k]*b[k, j]"
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
@@ -312,18 +309,16 @@ def test_fancy_matrix_mul(ctx_factory):
     order = "C"
 
     n = get_suitable_size(ctx)
-    from pymbolic import var
-    a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"]
 
     knl = lp.LoopKernel(ctx.devices[0],
             "[n] -> {[i,j,k]: 0<=i,j,k<n }",
             [
-                (c[i, j], a[i, k]*b[k, j])
+                "c[i, j] = a[i, k]*b[k, j]"
                 ],
             [
-                lp.ArrayArg("a", dtype, shape=(n_sym, n_sym), order=order),
-                lp.ArrayArg("b", dtype, shape=(n_sym, n_sym), order=order),
-                lp.ArrayArg("c", dtype, shape=(n_sym, n_sym), order=order),
+                lp.ArrayArg("a", dtype, shape="(n, n)", order=order),
+                lp.ArrayArg("b", dtype, shape="(n, n)", order=order),
+                lp.ArrayArg("c", dtype, shape="(n, n)", order=order),
                 lp.ScalarArg("n", np.int32, approximately=1000),
                 ], name="fancy_matmul")