From f05a6a827a8ba5cfff03248e9f1cc803b85429a0 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 21 Jun 2013 23:22:51 -0400
Subject: [PATCH] Introduce CInstruction, fix up rest of loopy to deal with it

---
 doc/reference.rst            |  24 ++-
 loopy/__init__.py            |  12 +-
 loopy/check.py               | 169 +++++++++++----------
 loopy/codegen/__init__.py    |   3 +
 loopy/codegen/instruction.py |  39 ++++-
 loopy/diagnostic.py          |   4 +
 loopy/kernel/__init__.py     |  23 ++-
 loopy/kernel/creation.py     | 137 ++++++++++-------
 loopy/kernel/data.py         | 282 ++++++++++++++++++++++++++++++-----
 loopy/kernel/tools.py        |   7 +
 loopy/precompute.py          |  10 +-
 loopy/preprocess.py          | 109 +++++++++-----
 loopy/schedule.py            |   8 +-
 loopy/subst.py               |  12 +-
 loopy/symbolic.py            |  14 +-
 test/test_dg.py              |   4 +
 test/test_loopy.py           |  28 +++-
 17 files changed, 635 insertions(+), 250 deletions(-)

diff --git a/doc/reference.rst b/doc/reference.rst
index bf62ec89e..b415e40af 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -135,14 +135,28 @@ Arguments
     :members:
     :undoc-members:
 
-.. _syntax:
+Temporary Variables
+^^^^^^^^^^^^^^^^^^^
 
-String Syntax
-^^^^^^^^^^^^^
+.. autoclass:: TemporaryVariable
+    :members:
+    :undoc-members:
+
+Substitution rules
+^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: SubstitutionRule
+
+String syntax: FIXME
+
+Instructions
+^^^^^^^^^^^^
+
+.. autoclass:: ExpressionInstruction
 
-* Substitution rules
+.. autoclass:: CInstruction
 
-* Instructions
+String syntax: FIXME
 
 Kernels
 ^^^^^^^
diff --git a/loopy/__init__.py b/loopy/__init__.py
index dc4e7bf32..b15f447d0 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -50,8 +50,10 @@ from loopy.library.preamble import default_preamble_generator
 from loopy.library.symbol import opencl_symbol_mangler
 
 from loopy.kernel.data import (
+        auto,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
-        ExpressionInstruction, CInstruction)
+        ExpressionInstruction, CInstruction,
+        TemporaryVariable)
 
 from loopy.kernel import LoopKernel
 from loopy.kernel.tools import (
@@ -76,6 +78,8 @@ __all__ = [
         "LoopKernel",
 
         "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg",
+        "TemporaryVariable",
+
         "ExpressionInstruction", "CInstruction",
 
         "default_function_mangler", "single_arg_function_mangler",
@@ -104,12 +108,6 @@ __all__ = [
         ]
 
 
-class auto:
-    """A generic placeholder object for something that should be automatically
-    detected.  See, for example, the *shape* or *strides* argument of
-    :class:`GlobalArg`.
-    """
-
 # }}}
 
 
diff --git a/loopy/check.py b/loopy/check.py
index b48b39a8b..a871508b9 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 from islpy import dim_type
 import islpy as isl
 from loopy.symbolic import WalkMapper
+from loopy.diagnostic import LoopyError, LoopyWarning
 
 import logging
 logger = logging.getLogger(__name__)
@@ -35,54 +36,6 @@ from loopy.diagnostic import WriteRaceConditionError
 
 # {{{ sanity checks run during scheduling
 
-def check_sizes(kernel):
-    import loopy as lp
-
-    from loopy.diagnostic import LoopyAdvisory
-
-    parameters = {}
-    for arg in kernel.args:
-        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
-            parameters[arg.name] = arg.approximately
-
-    glens, llens = kernel.get_grid_sizes_as_exprs()
-
-    if (max(len(glens), len(llens))
-            > kernel.device.max_work_item_dimensions):
-        raise RuntimeError("too many work item dimensions")
-
-    from pymbolic import evaluate
-    from pymbolic.mapper.evaluator import UnknownVariableError
-    try:
-        glens = evaluate(glens, parameters)
-        llens = evaluate(llens, parameters)
-    except UnknownVariableError, name:
-        from warnings import warn
-        warn("could not check axis bounds because no value "
-                "for variable '%s' was passed to check_kernels()"
-                % name, LoopyAdvisory)
-    else:
-        for i in range(len(llens)):
-            if llens[i] > kernel.device.max_work_item_sizes[i]:
-                raise RuntimeError("group axis %d too big" % i)
-
-        from pytools import product
-        if product(llens) > kernel.device.max_work_group_size:
-            raise RuntimeError("work group too big")
-
-    from pyopencl.characterize import usable_local_mem_size
-    if kernel.local_mem_use() > usable_local_mem_size(kernel.device):
-        raise RuntimeError(5, "using too much local memory")
-
-    from loopy.kernel.data import ConstantArg
-    const_arg_count = sum(
-            1 for arg in kernel.args
-            if isinstance(arg, ConstantArg))
-
-    if const_arg_count > kernel.device.max_constant_args:
-        raise RuntimeError("too many constant arguments")
-
-
 def check_for_unused_hw_axes_in_insns(kernel):
     group_size, local_size = kernel.get_grid_sizes_as_exprs()
 
@@ -107,16 +60,16 @@ def check_for_unused_hw_axes_in_insns(kernel):
             elif isinstance(tag, GroupIndexTag):
                 group_axes_used.add(tag.axis)
             elif isinstance(tag, AutoLocalIndexTagBase):
-                raise RuntimeError("auto local tag encountered")
+                raise LoopyError("auto local tag encountered")
 
         if group_axes != group_axes_used:
-            raise RuntimeError("instruction '%s' does not use all group hw axes "
+            raise LoopyError("instruction '%s' does not use all group hw axes "
                     "(available: %s used:%s)"
                     % (insn.id,
                         ",".join(str(i) for i in group_axes),
                         ",".join(str(i) for i in group_axes_used)))
         if local_axes != local_axes_used:
-            raise RuntimeError("instruction '%s' does not use all local hw axes"
+            raise LoopyError("instruction '%s' does not use all local hw axes"
                     "(available: %s used:%s)"
                     % (insn.id,
                         ",".join(str(i) for i in local_axes),
@@ -133,22 +86,18 @@ def check_for_double_use_of_hw_axes(kernel):
             if isinstance(tag, UniqueTag):
                 key = tag.key
                 if key in insn_tag_keys:
-                    raise RuntimeError("instruction '%s' has multiple "
+                    raise LoopyError("instruction '%s' has multiple "
                             "inames tagged '%s'" % (insn.id, tag))
 
                 insn_tag_keys.add(key)
 
 
 def check_for_inactive_iname_access(kernel):
-    from loopy.symbolic import DependencyMapper
-    depmap = DependencyMapper()
-
     for insn in kernel.instructions:
-        expression_indices = depmap(insn.expression)
-        expression_inames = expression_indices & kernel.all_inames()
+        expression_inames = insn.read_dependency_names() & kernel.all_inames()
 
         if not expression_inames <= kernel.insn_inames(insn):
-            raise RuntimeError(
+            raise LoopyError(
                     "instructiosn '%s' references "
                     "inames that the instruction does not depend on"
                     % insn.id)
@@ -173,7 +122,7 @@ def check_for_write_races(kernel):
 
             assignee_inames = assignee_indices & kernel.all_inames()
             if not assignee_inames <= kernel.insn_inames(insn):
-                raise RuntimeError(
+                raise LoopyError(
                         "assignee of instructiosn '%s' references "
                         "iname that the instruction does not depend on"
                         % insn.id)
@@ -207,11 +156,11 @@ def check_for_write_races(kernel):
                                 LocalIndexTagBase))
 
                 else:
-                    raise RuntimeError("temp var '%s' hasn't decided on "
+                    raise LoopyError("temp var '%s' hasn't decided on "
                             "whether it is local" % temp_var.name)
 
             else:
-                raise RuntimeError("invalid assignee name in instruction '%s'"
+                raise LoopyError("invalid assignee name in instruction '%s'"
                         % insn.id)
 
             race_inames = \
@@ -235,7 +184,7 @@ def check_for_orphaned_user_hardware_axes(kernel):
                 break
 
         if not found:
-            raise RuntimeError("user-requested local hardware axis %d "
+            raise LoopyError("user-requested local hardware axis %d "
                     "has no iname mapped to it" % axis)
 
 
@@ -254,7 +203,7 @@ def check_for_data_dependent_parallel_bounds(kernel):
         parameters = set(dom.get_var_names(dim_type.param))
         for par in parameters:
             if par in kernel.temporary_variables:
-                raise RuntimeError("Domain number %d has a data-dependent "
+                raise LoopyError("Domain number %d has a data-dependent "
                         "parameter '%s' and contains parallel "
                         "inames '%s'. This is not allowed (for now)."
                         % (i, par, ", ".join(par_inames)))
@@ -295,7 +244,7 @@ class _AccessCheckMapper(WalkMapper):
                 return
 
             if len(subscript) != len(shape):
-                raise RuntimeError("subscript to '%s' in '%s' has the wrong "
+                raise LoopyError("subscript to '%s' in '%s' has the wrong "
                         "number of indices (got: %d, expected: %d)" % (
                             expr.aggregate.name, expr,
                             len(subscript), len(shape)))
@@ -316,7 +265,7 @@ class _AccessCheckMapper(WalkMapper):
                 shape_domain = shape_domain.intersect(slab)
 
             if not access_range.is_subset(shape_domain):
-                raise RuntimeError("'%s' in instruction '%s' "
+                raise LoopyError("'%s' in instruction '%s' "
                         "accesses out-of-bounds array element"
                         % (expr, self.insn_id))
 
@@ -331,40 +280,38 @@ def check_bounds(kernel):
             continue
 
         acm = _AccessCheckMapper(kernel, domain, insn.id)
-        acm(insn.expression)
-        acm(insn.assignee)
+        insn.with_transformed_expressions(acm)
 
 
 def check_write_destinations(kernel):
     for insn in kernel.instructions:
         for wvar, _ in insn.assignees_and_indices():
             if wvar in kernel.all_inames():
-                raise RuntimeError("iname '%s' may not be written" % wvar)
+                raise LoopyError("iname '%s' may not be written" % wvar)
 
             insn_domain = kernel.get_inames_domain(kernel.insn_inames(insn))
             insn_params = set(insn_domain.get_var_names(dim_type.param))
 
             if wvar in kernel.all_params():
                 if wvar not in kernel.temporary_variables:
-                    raise RuntimeError("domain parameter '%s' may not be written"
+                    raise LoopyError("domain parameter '%s' may not be written"
                             "--it is not a temporary variable" % wvar)
 
                 if wvar in insn_params:
-                    raise RuntimeError("domain parameter '%s' may not be written "
+                    raise LoopyError("domain parameter '%s' may not be written "
                             "inside a domain dependent on it" % wvar)
 
             if not (wvar in kernel.temporary_variables
                     or wvar in kernel.arg_dict) and wvar not in kernel.all_params():
-                raise RuntimeError
+                raise LoopyError
 
 # }}}
 
 
-def run_automatic_checks(kernel):
+def pre_schedule_checks(kernel):
     try:
-        logger.info("sanity-check %s: start" % kernel.name)
+        logger.info("pre-schedule check %s: start" % kernel.name)
 
-        check_sizes(kernel)
         check_for_orphaned_user_hardware_axes(kernel)
         check_for_double_use_of_hw_axes(kernel)
         check_for_unused_hw_axes_in_insns(kernel)
@@ -374,16 +321,84 @@ def run_automatic_checks(kernel):
         check_bounds(kernel)
         check_write_destinations(kernel)
 
-        logger.info("sanity-check %s: done" % kernel.name)
+        logger.info("pre-schedule check %s: done" % kernel.name)
     except:
         print 75*"="
-        print "failing kernel after processing:"
+        print "failing kernel during pre-schedule check:"
         print 75*"="
         print kernel
         print 75*"="
         raise
 
 
+# {{{ pre-code-generation checks
+
+def check_sizes(kernel):
+    """Raise LoopyError if *kernel* exceeds ``kernel.device`` limits."""
+    import loopy as lp
+    from loopy.diagnostic import LoopyAdvisory
+
+    parameters = {}  # approximate values of ValueArgs, where given
+    for arg in kernel.args:
+        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
+            parameters[arg.name] = arg.approximately
+
+    glens, llens = kernel.get_grid_sizes_as_exprs()
+
+    if (max(len(glens), len(llens))
+            > kernel.device.max_work_item_dimensions):
+        raise LoopyError("too many work item dimensions")
+
+    from pymbolic import evaluate
+    from pymbolic.mapper.evaluator import UnknownVariableError
+    try:
+        glens = evaluate(glens, parameters)
+        llens = evaluate(llens, parameters)
+    except UnknownVariableError, name:
+        from warnings import warn
+        warn("could not check axis bounds because no value "
+                "for variable '%s' was passed to check_kernels()"
+                % name, LoopyAdvisory)
+    else:
+        for i in range(len(llens)):
+            if llens[i] > kernel.device.max_work_item_sizes[i]:
+                raise LoopyError("group axis %d too big" % i)
+
+        from pytools import product
+        if product(llens) > kernel.device.max_work_group_size:
+            raise LoopyError("work group too big")
+
+    from pyopencl.characterize import usable_local_mem_size
+    if kernel.local_mem_use() > usable_local_mem_size(kernel.device):
+        raise LoopyError("using too much local memory")
+
+    from loopy.kernel.data import ConstantArg
+    const_arg_count = sum(
+            1 for arg in kernel.args
+            if isinstance(arg, ConstantArg))
+
+    if const_arg_count > kernel.device.max_constant_args:
+        raise LoopyError("too many constant arguments")
+
+
+def pre_codegen_checks(kernel):
+    """Run the pre-code-generation checks (``check_sizes``) on *kernel*."""
+    try:
+        logger.info("pre-codegen check %s: start" % kernel.name)
+        check_sizes(kernel)
+
+        logger.info("pre-codegen check %s: done" % kernel.name)
+    except:  # dump the offending kernel for diagnosis, then re-raise
+        print 75*"="
+        print "failing kernel during pre-codegen check:"
+        print 75*"="
+        print kernel
+        print 75*"="
+        raise
+
+# }}}
+
+
 # {{{ sanity-check for implemented domains of each instruction
 
 def check_implemented_domains(kernel, implemented_domains, code=None):
@@ -453,7 +468,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 print get_highlighted_cl_code(code)
                 print 79*"-"
 
-            raise RuntimeError("sanity check failed--implemented and desired "
+            raise LoopyError("sanity check failed--implemented and desired "
                     "domain for instruction '%s' do not match\n\n"
                     "implemented: %s\n\n"
                     "desired:%s\n\n%s"
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index b61174c60..8b4e0d22f 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -258,6 +258,9 @@ def generate_code(kernel, with_annotation=False,
     from loopy.preprocess import infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
 
+    from loopy.check import pre_codegen_checks
+    pre_codegen_checks(kernel)
+
     from cgen import (FunctionBody, FunctionDeclaration,
             Value, Module, Block,
             Line, Const, LiteralLines, Initializer)
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 5015d6234..f333baadb 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 
 import islpy as isl
+from loopy.codegen import GeneratedInstruction
 
 
 def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt):
@@ -60,7 +61,6 @@ def generate_instruction_code(kernel, insn, codegen_state):
 
 
 def generate_expr_instruction_code(kernel, insn, codegen_state):
-    from loopy.codegen import GeneratedInstruction
 
     ccm = codegen_state.c_code_mapper
 
@@ -108,6 +108,41 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
 
 
 def generate_c_instruction_code(kernel, insn, codegen_state):
-    raise NotImplementedError
+    """Emit ``insn.code`` after iname initializers, wrapped in bounds checks."""
+    ccm = codegen_state.c_code_mapper
+    body = []
+
+    from loopy.codegen import POD
+    from cgen import Initializer, Block, Line
+
+    from pymbolic.primitives import Variable
+    for name, iname_expr in insn.iname_exprs:
+        if (isinstance(iname_expr, Variable)
+                and name not in ccm.var_subst_map):
+            # No need, the bare symbol will work
+            continue
+
+        body.append(
+                Initializer(
+                    POD(kernel.index_dtype, name),
+                    codegen_state.c_code_mapper(
+                        iname_expr, prec=None, type_context="i")))
+
+    if body:
+        body.append(Line())
+
+    body.extend(Line(l) for l in insn.code.split("\n"))
+
+    insn_inames = kernel.insn_inames(insn)
+    insn_code, impl_domain = wrap_in_bounds_checks(
+            ccm, kernel.get_inames_domain(insn_inames), insn_inames,
+            codegen_state.implemented_domain,
+            Block(body))
+
+    return GeneratedInstruction(
+        insn_id=insn.id,
+        implemented_domain=impl_domain,
+        ast=insn_code)
+
 
 # vim: foldmethod=marker
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index d3ed8770f..dd2860a0e 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -29,6 +29,10 @@ class LoopyWarningBase(UserWarning):
     pass
 
 
+class LoopyWarning(LoopyWarningBase):
+    pass
+
+
 class LoopyAdvisory(LoopyWarningBase):
     pass
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 68dac0dae..8a7c53f84 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -860,7 +860,22 @@ class LoopKernel(Record):
         lines.append(sep)
         lines.append("INSTRUCTIONS:")
         loop_list_width = 35
+
+        import loopy as lp
         for insn in self.instructions:
+            if isinstance(insn, lp.ExpressionInstruction):
+                lhs = str(insn.assignee)
+                rhs = str(insn.expression)
+                trailing = []
+            elif isinstance(insn, lp.CInstruction):
+                lhs = ", ".join(str(a) for a in insn.assignees)
+                rhs = "CODE(%s|%s)" % (
+                        ", ".join(str(x) for x in insn.read_variables),
+                        ", ".join("%s=%s" % (name, expr)
+                            for name, expr in insn.iname_exprs))
+
+                trailing = ["    "+l for l in insn.code.split("\n")]
+
             loop_list = ",".join(sorted(self.insn_inames(insn)))
 
             options = [insn.id]
@@ -870,12 +885,14 @@ class LoopKernel(Record):
             if len(loop_list) > loop_list_width:
                 lines.append("[%s]" % loop_list)
                 lines.append("%s%s <- %s   # %s" % (
-                    (loop_list_width+2)*" ", insn.assignee,
-                    insn.expression, ", ".join(options)))
+                    (loop_list_width+2)*" ", lhs,
+                    rhs, ", ".join(options)))
             else:
                 lines.append("[%s]%s%s <- %s   # %s" % (
                     loop_list, " "*(loop_list_width-len(loop_list)),
-                    insn.assignee, insn.expression, ", ".join(options)))
+                    lhs, rhs, ", ".join(options)))
+
+            lines.extend(trailing)
 
         lines.append(sep)
         lines.append("DEPENDENCIES:")
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f4cd45c27..157b7cbf3 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -34,6 +34,9 @@ from islpy import dim_type
 
 import re
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 # {{{ identifier wrangling
 
@@ -477,13 +480,8 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables,
 def tag_reduction_inames_as_sequential(knl):
     result = set()
 
-    def map_reduction(red_expr, rec):
-        rec(red_expr.expr)
-        result.update(red_expr.inames)
-
-    from loopy.symbolic import ReductionCallbackMapper
     for insn in knl.instructions:
-        ReductionCallbackMapper(map_reduction)(insn.expression)
+        result.update(insn.reduction_inames())
 
     from loopy.kernel.data import ParallelTag, ForceSequentialTag
 
@@ -615,7 +613,7 @@ def expand_cses(knl):
         new_temp_vars[new_var_name] = TemporaryVariable(
                 name=new_var_name,
                 dtype=dtype,
-                is_local=None,
+                is_local=lp.auto,
                 shape=())
 
         from pymbolic.primitives import Variable
@@ -656,29 +654,13 @@ def create_temporaries(knl):
     new_insns = []
     new_temp_vars = knl.temporary_variables.copy()
 
-    from loopy.symbolic import AccessRangeMapper
+    import loopy as lp
 
     for insn in knl.instructions:
-        if not isinstance(insn, ExpressionInstruction):
-            continue
-
-        from loopy.kernel.data import TemporaryVariable
-
-        if insn.temp_var_type is not None:
+        if isinstance(insn, ExpressionInstruction) \
+                and insn.temp_var_type is not None:
             (assignee_name, _), = insn.assignees_and_indices()
 
-            armap = AccessRangeMapper(knl, assignee_name)
-            armap(insn.assignee, knl.insn_inames(insn))
-
-            if armap.access_range is not None:
-                base_indices, shape = zip(*[
-                        knl.cache_manager.base_index_and_length(
-                            armap.access_range, i)
-                        for i in xrange(armap.access_range.dim(dim_type.set))])
-            else:
-                base_indices = ()
-                shape = ()
-
             if assignee_name in new_temp_vars:
                 raise RuntimeError("cannot create temporary variable '%s'--"
                         "already exists" % assignee_name)
@@ -686,12 +668,15 @@ def create_temporaries(knl):
                 raise RuntimeError("cannot create temporary variable '%s'--"
                         "already exists as argument" % assignee_name)
 
-            new_temp_vars[assignee_name] = TemporaryVariable(
+            logger.debug("%s: creating temporary %s"
+                    % (knl.name, assignee_name))
+
+            new_temp_vars[assignee_name] = lp.TemporaryVariable(
                     name=assignee_name,
                     dtype=insn.temp_var_type,
-                    is_local=None,
-                    base_indices=base_indices,
-                    shape=shape)
+                    is_local=lp.auto,
+                    base_indices=lp.auto,
+                    shape=lp.auto)
 
             insn = insn.copy(temp_var_type=None)
 
@@ -704,29 +689,43 @@ def create_temporaries(knl):
 # }}}
 
 
-# {{{ check for reduction iname duplication
+# {{{ determine shapes of temporaries
+
+def determine_shapes_of_temporaries(knl):
+    """Fill in ``auto`` shapes and base indices of temporary variables."""
 
-def check_for_reduction_inames_duplication_requests(kernel):
+    from loopy.symbolic import AccessRangeMapper
+    from pymbolic import var
+    import loopy as lp
 
-    # {{{ helper function
+    new_temp_vars = {}
+    for tv in knl.temporary_variables.itervalues():
+        if tv.shape is lp.auto or tv.base_indices is lp.auto:
+            armap = AccessRangeMapper(knl, tv.name)
+            for insn in knl.instructions:
+                for assignee_name, assignee_index in insn.assignees_and_indices():
+                    if assignee_index:
+                        armap(var(assignee_name)[assignee_index],
+                                knl.insn_inames(insn))
 
-    def check_reduction_inames(reduction_expr, rec):
-        for iname in reduction_expr.inames:
-            if iname.startswith("@"):
-                raise RuntimeError(
-                        "Reduction iname duplication with '@' is no "
-                        "longer supported. Use loopy.duplicate_inames "
-                        "instead.")
+            if armap.access_range is not None:
+                base_indices, shape = zip(*[
+                        knl.cache_manager.base_index_and_length(
+                            armap.access_range, i)
+                        for i in xrange(armap.access_range.dim(dim_type.set))])
+            else:
+                base_indices = ()
+                shape = ()
 
-    # }}}
+            if tv.base_indices is lp.auto:
+                tv = tv.copy(base_indices=base_indices)
+            if tv.shape is lp.auto:
+                tv = tv.copy(shape=shape)
 
-    from loopy.symbolic import ReductionCallbackMapper
-    rcm = ReductionCallbackMapper(check_reduction_inames)
-    for insn in kernel.instructions:
-        rcm(insn.expression)
+        new_temp_vars[tv.name] = tv
 
-    for sub_name, sub_rule in kernel.substitutions.iteritems():
-        rcm(sub_rule.expression)
+    return knl.copy(
+            temporary_variables=new_temp_vars)
 
 # }}}
 
@@ -767,10 +766,11 @@ def guess_arg_shape_if_requested(kernel, default_order):
             armap = AccessRangeMapper(kernel, arg.name)
 
             for insn in kernel.instructions:
-                armap(submap(insn.assignee, insn.id),
-                        kernel.insn_inames(insn))
-                armap(submap(insn.expression, insn.id),
-                        kernel.insn_inames(insn))
+                if isinstance(insn, lp.ExpressionInstruction):
+                    armap(submap(insn.assignee, insn.id),
+                            kernel.insn_inames(insn))
+                    armap(submap(insn.expression, insn.id),
+                            kernel.insn_inames(insn))
 
             if armap.access_range is None:
                 # no subscripts found, let's call it a scalar
@@ -832,13 +832,20 @@ def apply_default_order_to_args(kernel, default_order):
 
 # {{{ kernel creation top-level
 
-def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs):
+def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs):
     """User-facing kernel creation entrypoint.
 
     :arg device: :class:`pyopencl.Device`
     :arg domains: :class:`islpy.BasicSet`
     :arg instructions:
-    :arg kernel_args:
+    :arg kernel_data:
+
+        A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances.
+        The order of these arguments determines the order of the arguments
+        to the generated kernel.
+
+        May also contain :class:`TemporaryVariable` instances (which do not
+        give rise to kernel-level arguments).
 
     The following keyword arguments are recognized:
 
@@ -873,13 +880,28 @@ def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs):
     :arg local_sizes: A dictionary from integers to integers, mapping
         workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
         length 16.
-    :arg temporary_variables:
     """
 
     defines = kwargs.pop("defines", {})
     default_order = kwargs.pop("default_order", "C")
     default_offset = kwargs.pop("default_offset", 0)
 
+    # {{{ separate temporary variables and arguments
+
+    from loopy.kernel.data import TemporaryVariable
+
+    kernel_args = []
+    temporary_variables = {}
+    for dat in kernel_data:
+        if isinstance(dat, TemporaryVariable):
+            temporary_variables[dat.name] = dat
+        else:
+            kernel_args.append(dat)
+
+    del kernel_data
+
+    # }}}
+
     # {{{ instruction/subst parsing
 
     parsed_instructions = []
@@ -916,18 +938,19 @@ def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs):
     domains = parse_domains(isl_context, domains, defines)
 
     kernel_args = guess_kernel_args_if_requested(domains, instructions,
-            kwargs.get("temporary_variables", {}), substitutions,
+            temporary_variables, substitutions,
             duplicate_args_with_commas(kernel_args),
             default_offset)
 
     from loopy.kernel import LoopKernel
-    knl = LoopKernel(device, domains, instructions, kernel_args, **kwargs)
+    knl = LoopKernel(device, domains, instructions, kernel_args,
+            temporary_variables=temporary_variables, **kwargs)
 
     check_for_nonexistent_iname_deps(knl)
-    check_for_reduction_inames_duplication_requests(knl)
 
     knl = tag_reduction_inames_as_sequential(knl)
     knl = create_temporaries(knl)
+    knl = determine_shapes_of_temporaries(knl)
     knl = expand_cses(knl)
     knl = expand_defines_in_shapes(knl, defines)
     knl = guess_arg_shape_if_requested(knl, default_order)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index c0dcd036f..c844995d6 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -30,6 +30,13 @@ from pytools import Record, memoize_method
 from loopy.kernel.array import ArrayBase
 
 
+class auto:
+    """A generic placeholder object for something that should be automatically
+    detected.  See, for example, the *shape* or *strides* argument of
+    :class:`GlobalArg`.
+    """
+
+
 # {{{ iname tags
 
 class IndexTag(Record):
@@ -231,6 +238,10 @@ class TemporaryVariable(ArrayBase):
     .. attribute:: storage_shape
     .. attribute:: base_indices
     .. attribute:: is_local
+
+        Whether this temporary lives in ``local`` memory.
+        May be *True*, *False*, or :class:`loopy.auto` if this is
+        to be automatically determined.
     """
 
     min_target_axes = 0
@@ -242,9 +253,19 @@ class TemporaryVariable(ArrayBase):
             "is_local"
             ]
 
-    def __init__(self, name, dtype, shape, is_local,
+    def __init__(self, name, dtype, shape=(), is_local=auto,
             dim_tags=None, offset=0, strides=None, order=None,
             base_indices=None, storage_shape=None):
+        """
+        :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
+        :arg shape: :class:`loopy.auto` or a shape tuple
+        :arg base_indices: :class:`loopy.auto` or a tuple of base indices
+        """
+
+        if is_local is None:
+            raise ValueError("is_local is None is no longer supported. "
+                    "Use loopy.auto.")
+
         if base_indices is None:
             base_indices = (0,) * len(shape)
 
@@ -372,6 +393,13 @@ class InstructionBase(Record):
         """
         raise NotImplementedError
 
+    def with_transformed_expressions(self, f, *args):
+        """Return a new copy of *self* where *f* has been applied to every
+        expression occurring in *self*. *args* will be passed as extra
+        arguments (in addition to the expression) to *f*.
+        """
+        raise NotImplementedError
+
     # }}}
 
     @memoize_method
@@ -387,7 +415,7 @@ class InstructionBase(Record):
             from loopy.symbolic import get_dependencies
             result.update(get_dependencies(indices))
 
-        return result
+        return frozenset(result)
 
     def dependency_names(self):
         return self.read_dependency_names() | self.write_dependency_names()
@@ -395,6 +423,45 @@ class InstructionBase(Record):
     def assignee_var_names(self):
         return (var_name for var_name, _ in self.assignees_and_indices())
 
+    def get_str_options(self):
+        result = []
+
+        if self.boostable is True:
+            if self.boostable_into:
+                result.append("boostable into '%s'" % ",".join(self.boostable_into))
+            else:
+                result.append("boostable")
+        elif self.boostable is False:
+            result.append("not boostable")
+        elif self.boostable is None:
+            pass
+        else:
+            raise RuntimeError("unexpected value for Instruction.boostable")
+
+        if self.insn_deps:
+            result.append("deps="+":".join(self.insn_deps))
+        if self.priority:
+            result.append("priority=%d" % self.priority)
+
+        return result
+
+
+def _get_assignee_and_index(expr):
+    from pymbolic.primitives import Variable, Subscript
+    if isinstance(expr, Variable):
+        return (expr.name, ())
+    elif isinstance(expr, Subscript):
+        agg = expr.aggregate
+        assert isinstance(agg, Variable)
+
+        idx = expr.index
+        if not isinstance(idx, tuple):
+            idx = (idx,)
+
+        return (agg.name, idx)
+    else:
+        raise RuntimeError("invalid lvalue '%s'" % expr)
+
 
 class ExpressionInstruction(InstructionBase):
     """
@@ -402,7 +469,7 @@ class ExpressionInstruction(InstructionBase):
 
     .. attribute:: expression
 
-    The following instance variables are only used until
+    The following attributes are only used until
     :func:`loopy.make_kernel` is finished:
 
     .. attribute:: temp_var_type
@@ -415,8 +482,8 @@ class ExpressionInstruction(InstructionBase):
             set("assignee expression temp_var_type".split())
 
     def __init__(self,
-            id, assignee, expression,
-            forced_iname_deps=frozenset(), insn_deps=set(), boostable=None,
+            assignee, expression,
+            id=None, forced_iname_deps=frozenset(), insn_deps=set(), boostable=None,
             boostable_into=None,
             temp_var_type=None, priority=0):
 
@@ -461,24 +528,12 @@ class ExpressionInstruction(InstructionBase):
 
     @memoize_method
     def assignees_and_indices(self):
-        from pymbolic.primitives import Variable, Subscript
-
-        if isinstance(self.assignee, Variable):
-            return [(self.assignee.name, ())]
-        elif isinstance(self.assignee, Subscript):
-            agg = self.assignee.aggregate
-            assert isinstance(agg, Variable)
-            var_name = agg.name
-
-            idx = self.assignee.index
-            if not isinstance(idx, tuple):
-                idx = (idx,)
+        return [_get_assignee_and_index(self.assignee)]
 
-            return [(agg.name, idx)]
-        else:
-            raise RuntimeError("invalid lvalue '%s'" % self.assignee)
-
-        return var_name
+    def with_transformed_expressions(self, f, *args):
+        return self.copy(
+                assignee=f(self.assignee, *args),
+                expression=f(self.expression, *args))
 
     # }}}
 
@@ -486,30 +541,177 @@ class ExpressionInstruction(InstructionBase):
         result = "%s: %s <- %s" % (self.id,
                 self.assignee, self.expression)
 
-        if self.boostable is True:
-            if self.boostable_into:
-                result += " (boostable into '%s')" % ",".join(self.boostable_into)
-            else:
-                result += " (boostable)"
-        elif self.boostable is False:
-            result += " (not boostable)"
-        elif self.boostable is None:
-            pass
-        else:
-            raise RuntimeError("unexpected value for Instruction.boostable")
+        options = self.get_str_options()
+        if options:
+            result += " (%s)" % (": ".join(options))
 
-        options = []
+        return result
 
-        if self.insn_deps:
-            options.append("deps="+":".join(self.insn_deps))
-        if self.priority:
-            options.append("priority=%d" % self.priority)
 
-        return result
+def _remove_common_indentation(code):
+    """Dedent multi-line *code* by its first non-blank line's indentation.
+
+    Single-line *code* is returned as-is. Multi-line *code* must begin with
+    a newline, optionally preceded by ``//CL//``. Leading/trailing blank
+    lines are dropped. Raises :exc:`ValueError` on malformed input.
+    """
+    if "\n" not in code:
+        return code
+
+    # accommodate pyopencl-ish syntax highlighting; slice the marker off,
+    # since str.lstrip() strips a character set ('/', 'C', 'L') instead
+    if code.startswith("//CL//"):
+        code = code[len("//CL//"):]
+    if not code.startswith("\n"):
+        raise ValueError("expected newline as first character "
+                "in literal lines")
+
+    lines = code.split("\n")
+    while lines and lines[0].strip() == "":
+        lines.pop(0)
+    while lines and lines[-1].strip() == "":
+        lines.pop(-1)
+    base_indent = len(lines[0]) - len(lines[0].lstrip(" \t")) if lines else 0
+    if any(line[:base_indent].strip() for line in lines[1:]):
+        raise ValueError("inconsistent indentation")
+    return "\n".join(line[base_indent:] for line in lines)
 
 
 class CInstruction(InstructionBase):
-    pass
+    """
+    .. attribute:: iname_exprs
+
+        A list of tuples *(name, expr)* of inames or expressions based on them
+        that the instruction needs access to.
+
+    .. attribute:: code
+
+        The C code to be executed.
+
+        The code should obey the following rules:
+
+        * It should only write to temporary variables, specifically the
+          temporary variables
+
+        .. note::
+
+            Of course, nothing in :mod:`loopy` will prevent you from doing
+            'forbidden' things in your C code. If you ignore the rules and
+            something breaks, you get to keep both pieces.
+
+    .. attribute:: read_variables
+
+        A :class:`frozenset` of variable names that :attr:`code` reads. This is
+        optional and only used for figuring out dependencies.
+
+    .. attribute:: assignees
+
+        A sequence of variable references (with or without subscript) as
+        :class:`pymbolic.primitives.Expression` instances that :attr:`code`
+        writes to. This is optional and only used for figuring out dependencies.
+    """
+
+    fields = InstructionBase.fields | \
+            set("iname_exprs code read_variables assignees".split())
+
+    def __init__(self,
+            iname_exprs, code,
+            read_variables=frozenset(), assignees=frozenset(),
+            id=None, insn_deps=set(), forced_iname_deps=frozenset(), priority=0,
+            boostable=None, boostable_into=None):
+        """
+        :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples,
+            simple strings representing inames are also allowed. A single
+            string is also allowed, which should consist of comma-separated
+            inames.
+        :arg assignees: Like :attr:`assignees`, but may also be a
+            semicolon-separated string of such expressions or a
+            sequence of strings parseable into the desired format.
+        """
+
+        InstructionBase.__init__(self,
+                id=id,
+                forced_iname_deps=forced_iname_deps,
+                insn_deps=insn_deps, boostable=boostable,
+                boostable_into=boostable_into,
+                priority=priority)
+
+        # {{{ normalize iname_exprs
+
+        if isinstance(iname_exprs, str):
+            iname_exprs = [i.strip() for i in iname_exprs.split(",")]
+            iname_exprs = [i for i in iname_exprs if i]
+
+        from pymbolic import var
+        new_iname_exprs = []
+        for i in iname_exprs:
+            if isinstance(i, str):
+                new_iname_exprs.append((i, var(i)))
+            else:
+                new_iname_exprs.append(i)
+
+        # }}}
+
+        # {{{ normalize assignees
+
+        if isinstance(assignees, str):
+            assignees = [i.strip() for i in assignees.split(";")]
+            assignees = [i for i in assignees if i]
+
+        new_assignees = []
+        from loopy.symbolic import parse
+        for i in assignees:
+            if isinstance(i, str):
+                new_assignees.append(parse(i))
+            else:
+                new_assignees.append(i)
+        # }}}
+
+        self.iname_exprs = new_iname_exprs
+        self.code = _remove_common_indentation(code)
+        self.read_variables = read_variables
+        self.assignees = new_assignees
+
+    # {{{ abstract interface
+
+    def read_dependency_names(self):
+        result = set(self.read_variables)
+
+        from loopy.symbolic import get_dependencies
+        for name, iname_expr in self.iname_exprs:
+            result.update(get_dependencies(iname_expr))
+
+        return frozenset(result)
+
+    def reduction_inames(self):
+        return set()
+
+    def assignees_and_indices(self):
+        return [_get_assignee_and_index(expr)
+                for expr in self.assignees]
+
+    def with_transformed_expressions(self, f, *args):
+        return self.copy(
+                iname_exprs=[
+                    (name, f(expr, *args))
+                    for name, expr in self.iname_exprs],
+                assignees=[f(a, *args) for a in self.assignees])
+
+    # }}}
+
+    def __str__(self):
+        first_line = "%s: %s <- CODE(%s|%s)" % (self.id,
+                ", ".join(str(a) for a in self.assignees),
+                ", ".join(str(x) for x in self.read_variables),
+                ", ".join("%s=%s" % (name, expr)
+                    for name, expr in self.iname_exprs))
+
+        options = self.get_str_options()
+        if options:
+            first_line += " (%s)" % (": ".join(options))
+
+        return first_line + "\n    " + "\n    ".join(
+                self.code.split("\n"))
 
 # }}}
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 51cc266db..72baba59d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -99,6 +99,10 @@ def find_all_insn_inames(kernel):
                 deps & kernel.all_inames()
                 | insn.forced_iname_deps)
 
+        assert isinstance(read_deps, frozenset), type(insn)
+        assert isinstance(write_deps, frozenset), type(insn)
+        assert isinstance(iname_deps, frozenset), type(insn)
+
         insn_id_to_inames[insn.id] = iname_deps
         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
 
@@ -174,6 +178,9 @@ def find_all_insn_inames(kernel):
         if not did_something:
             break
 
+    for v in insn_id_to_inames.itervalues():
+        assert isinstance(v, frozenset)
+
     return insn_id_to_inames
 
 # }}}
diff --git a/loopy/precompute.py b/loopy/precompute.py
index 1c6634b13..3b33e895a 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -672,8 +672,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     invg = InvocationGatherer(kernel, subst_name, subst_tag, within)
 
+    import loopy as lp
     for insn in kernel.instructions:
-        invg(insn.expression, insn.id)
+        if isinstance(insn, lp.ExpressionInstruction):
+            invg(insn.expression, insn.id)
 
     for invdesc in invg.invocation_descriptors:
         invocation_descriptors.append(
@@ -857,15 +859,15 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     else:
         dtype = np.dtype(dtype)
 
-    from loopy.kernel.data import TemporaryVariable
+    import loopy as lp
 
     new_temporary_variables = kernel.temporary_variables.copy()
-    temp_var = TemporaryVariable(
+    temp_var = lp.TemporaryVariable(
             name=target_var_name,
             dtype=dtype,
             base_indices=(0,)*len(non1_storage_shape),
             shape=tuple(non1_storage_shape),
-            is_local=None)
+            is_local=lp.auto)
 
     new_temporary_variables[target_var_name] = temp_var
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f71d98bba..d41a36405 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 import pyopencl as cl
 import pyopencl.characterize as cl_char
+from loopy.diagnostic import LoopyError, LoopyWarning
 
 import logging
 logger = logging.getLogger(__name__)
@@ -36,31 +37,38 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     if var_name in kernel.all_params():
         return kernel.index_dtype
 
+    def debug(s):
+        logger.debug("%s: %s" % (kernel.name, s))
+
     dtypes = []
 
+    import loopy as lp
+
     from loopy.codegen.expression import DependencyTypeInferenceFailure
     for writer_insn_id in kernel.writer_map().get(var_name, []):
-        expr = subst_expander(
-                kernel.id_to_insn[writer_insn_id].expression,
-                insn_id=writer_insn_id)
+        writer_insn = kernel.id_to_insn[writer_insn_id]
+        if not isinstance(writer_insn, lp.ExpressionInstruction):
+            continue
+
+        expr = subst_expander(writer_insn.expression, insn_id=writer_insn_id)
 
         try:
-            logger.debug("             via expr %s" % expr)
+            debug("             via expr %s" % expr)
             result = type_inf_mapper(expr)
 
-            logger.debug("             result: %s" % result)
+            debug("             result: %s" % result)
 
             dtypes.append(result)
 
         except DependencyTypeInferenceFailure, e:
-            logger.debug("             failed: %s" % e)
+            debug("             failed: %s" % e)
 
     if not dtypes:
         return None
 
     from pytools import is_single_valued
     if not is_single_valued(dtypes):
-        raise RuntimeError("ambiguous type inference for '%s'"
+        raise LoopyError("ambiguous type inference for '%s'"
                 % var_name)
 
     return dtypes[0]
@@ -89,6 +97,20 @@ class _DictUnionView:
 def infer_unknown_types(kernel, expect_completion=False):
     """Infer types on temporaries and argumetns."""
 
+    logger.debug("%s: infer types" % kernel.name)
+
+    def debug(s):
+        logger.debug("%s: %s" % (kernel.name, s))
+
+    if kernel.substitutions:
+        from warnings import warn
+        warn("type inference called when substitution "
+                "rules are still unexpanded, expanding",
+                LoopyWarning, stacklevel=2)
+
+        from loopy.subst import expand_subst
+        kernel = expand_subst(kernel)
+
     new_temp_vars = kernel.temporary_variables.copy()
     new_arg_dict = kernel.arg_dict.copy()
 
@@ -127,27 +149,27 @@ def infer_unknown_types(kernel, expect_completion=False):
     while queue:
         item = queue.pop(0)
 
-        logger.debug("inferring type for %s %s" % (type(item).__name__, item.name))
+        debug("inferring type for %s %s" % (type(item).__name__, item.name))
 
         result = _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
 
         failed = result is None
         if not failed:
-            logger.debug("     success: %s" % result)
+            debug("     success: %s" % result)
             if isinstance(item, TemporaryVariable):
                 new_temp_vars[item.name] = item.copy(dtype=result)
             elif isinstance(item, KernelArgument):
                 new_arg_dict[item.name] = item.copy(dtype=result)
             else:
-                raise RuntimeError("unexpected item type in type inference")
+                raise LoopyError("unexpected item type in type inference")
         else:
-            logger.debug("     failure")
+            debug("     failure")
 
         if failed:
             if item.name in failed_names:
                 # this item has failed before, give up.
                 if expect_completion:
-                    raise RuntimeError(
+                    raise LoopyError(
                             "could not determine type of '%s'" % item.name)
                 else:
                     # We're done here.
@@ -183,8 +205,11 @@ def infer_unknown_types(kernel, expect_completion=False):
 # {{{ decide which temporaries are local
 
 def mark_local_temporaries(kernel):
+    logger.debug("%s: mark local temporaries" % kernel.name)
+
     new_temp_vars = {}
     from loopy.kernel.data import LocalIndexTagBase
+    import loopy as lp
 
     writers = kernel.writer_map()
 
@@ -194,7 +219,7 @@ def mark_local_temporaries(kernel):
         # Only fill out for variables that do not yet know if they're
         # local. (I.e. those generated by implicit temporary generation.)
 
-        if temp_var.is_local is not None:
+        if temp_var.is_local is not lp.auto:
             new_temp_vars[temp_var.name] = temp_var
             continue
 
@@ -251,7 +276,7 @@ def mark_local_temporaries(kernel):
         is_local = wants_to_be_local_per_insn[0]
         from pytools import all
         if not all(wtbl == is_local for wtbl in wants_to_be_local_per_insn):
-            raise RuntimeError("not all instructions agree on whether "
+            raise LoopyError("not all instructions agree on whether "
                     "temporary '%s' should be in local memory" % temp_var.name)
 
         new_temp_vars[temp_var.name] = temp_var.copy(is_local=is_local)
@@ -276,6 +301,8 @@ def realize_reduction(kernel, insn_id_filter=None):
     be realized.
     """
 
+    logger.debug("%s: realize reduction" % kernel.name)
+
     new_insns = []
 
     var_name_gen = kernel.get_var_name_generator()
@@ -306,7 +333,7 @@ def realize_reduction(kernel, insn_id_filter=None):
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = set(expr.inames) & outer_insn_inames
         if bad_inames:
-            raise RuntimeError("reduction used within loop(s) that it was "
+            raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
         new_id = temp_kernel.make_unique_instruction_id(
@@ -346,13 +373,15 @@ def realize_reduction(kernel, insn_id_filter=None):
 
     temp_kernel = kernel
 
+    import loopy as lp
     while insn_queue:
         new_insn_insn_deps = set()
         generated_insns = []
 
         insn = insn_queue.pop(0)
 
-        if insn_id_filter is not None and insn.id != insn_id_filter:
+        if insn_id_filter is not None and insn.id != insn_id_filter \
+                or not isinstance(insn, lp.ExpressionInstruction):
             new_insns.append(insn)
             continue
 
@@ -423,10 +452,11 @@ class ExtraInameIndexInserter(IdentityMapper):
 
 
 def duplicate_private_temporaries_for_ilp(kernel):
+    logger.debug("%s: duplicate temporaries for ilp" % kernel.name)
+
     wmap = kernel.writer_map()
 
     from loopy.kernel.data import IlpBaseTag
-    from loopy.symbolic import get_dependencies
 
     var_to_new_ilp_inames = {}
 
@@ -440,13 +470,13 @@ def duplicate_private_temporaries_for_ilp(kernel):
                     if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag))
 
             referenced_ilp_inames = (ilp_inames
-                    & get_dependencies(writer_insn.assignee))
+                    & writer_insn.write_dependency_names())
 
             new_ilp_inames = ilp_inames - referenced_ilp_inames
 
             if tv.name in var_to_new_ilp_inames:
                 if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
-                    raise RuntimeError("instruction '%s' requires adding "
+                    raise LoopyError("instruction '%s' requires adding "
                             "indices for ILP inames '%s' on var '%s', but previous "
                             "instructions required inames '%s'"
                             % (writer_insn_id, ", ".join(new_ilp_inames),
@@ -502,9 +532,7 @@ def duplicate_private_temporaries_for_ilp(kernel):
                 for var_name, inames in var_to_new_ilp_inames.iteritems()))
 
     new_insns = [
-            insn.copy(
-                assignee=eiii(insn.assignee),
-                expression=eiii(insn.expression))
+            insn.with_transformed_expressions(eiii)
             for insn in kernel.instructions]
 
     return kernel.copy(
@@ -517,6 +545,8 @@ def duplicate_private_temporaries_for_ilp(kernel):
 # {{{ automatic dependencies, find boostability of instructions
 
 def add_boostability_and_automatic_dependencies(kernel):
+    logger.debug("%s: automatic deps, boostability" % kernel.name)
+
     writer_map = kernel.writer_map()
 
     arg_names = set(arg.name for arg in kernel.args)
@@ -604,6 +634,8 @@ def limit_boostability(kernel):
     and then limits boostability to just those inames.
     """
 
+    logger.debug("%s: limit boostability" % kernel.name)
+
     iname_occurs_with = {}
     for insn in kernel.instructions:
         insn_inames = kernel.insn_inames(insn)
@@ -621,7 +653,7 @@ def limit_boostability(kernel):
     new_insns = []
     for insn in kernel.instructions:
         if insn.boostable is None:
-            raise RuntimeError("insn '%s' has undetermined boostability" % insn.id)
+            raise LoopyError("insn '%s' has undetermined boostability" % insn.id)
         elif insn.boostable:
             boostable_into = set()
             for iname in kernel.insn_inames(insn):
@@ -654,7 +686,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
             if arg.approximately is not None:
                 approximate_arg_values[arg.name] = arg.approximately
             else:
-                raise RuntimeError("No approximate arg value specified for '%s'"
+                raise LoopyError("No approximate arg value specified for '%s'"
                         % arg.name)
 
     # {{{ find all array accesses in insn
@@ -762,6 +794,8 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 # {{{ assign automatic axes
 
 def assign_automatic_axes(kernel, axis=0, local_size=None):
+    logger.debug("%s: assign automatic axes" % kernel.name)
+
     from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
 
     # Realize that at this point in time, axis lengths are already
@@ -831,7 +865,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                         axis=recursion_axis, local_size=local_size)
 
         if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
-            raise RuntimeError("trying to reassign '%s'" % iname)
+            raise LoopyError("trying to reassign '%s'" % iname)
 
         new_iname_to_tag = kernel.iname_to_tag.copy()
         new_iname_to_tag[iname] = new_tag
@@ -845,7 +879,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     # assignment proceeds in one phase per axis, each time assigning the
     # smallest-stride available iname to the current axis
 
+    import loopy as lp
+
     for insn in kernel.instructions:
+        if not isinstance(insn, lp.ExpressionInstruction):
+            continue
+
         auto_axis_inames = [
                 iname
                 for iname in kernel.insn_inames(insn)
@@ -900,6 +939,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 # {{{ temp storage adjust for bank conflict
 
 def adjust_local_temp_var_storage(kernel):
+    logger.debug("%s: adjust temp var storage" % kernel.name)
+
     new_temp_vars = {}
 
     lmem_size = cl_char.usable_local_mem_size(kernel.device)
@@ -975,49 +1016,35 @@ def adjust_local_temp_var_storage(kernel):
 
 
 def preprocess_kernel(kernel):
-    logger.info("preprocess %s: start" % kernel.name)
+    logger.info("%s: preprocess start" % kernel.name)
 
     from loopy.subst import expand_subst
-    logger.debug("preprocess %s: expand subst" % kernel.name)
     kernel = expand_subst(kernel)
 
     # Ordering restriction:
     # Type inference doesn't handle substitutions. Get them out of the
     # way.
 
-    logger.debug("preprocess %s: infer types" % kernel.name)
     kernel = infer_unknown_types(kernel, expect_completion=False)
 
     # Ordering restriction:
     # realize_reduction must happen after type inference because it needs
     # to be able to determine the types of the reduced expressions.
 
-    logger.debug("preprocess %s: realize reduction" % kernel.name)
     kernel = realize_reduction(kernel)
 
     # Ordering restriction:
     # duplicate_private_temporaries_for_ilp because reduction accumulators
     # need to be duplicated by this.
 
-    logger.debug("preprocess %s: duplicate temporaries for ilp" % kernel.name)
     kernel = duplicate_private_temporaries_for_ilp(kernel)
-
-    logger.debug("preprocess %s: mark local temporaries" % kernel.name)
     kernel = mark_local_temporaries(kernel)
-
-    logger.debug("preprocess %s: assign automatic axes" % kernel.name)
     kernel = assign_automatic_axes(kernel)
-
-    logger.debug("preprocess %s: automatic deps, boostability" % kernel.name)
     kernel = add_boostability_and_automatic_dependencies(kernel)
-
-    logger.debug("preprocess %s: limit boostability" % kernel.name)
     kernel = limit_boostability(kernel)
-
-    logger.debug("preprocess %s: adjust temp var storage" % kernel.name)
     kernel = adjust_local_temp_var_storage(kernel)
 
-    logger.info("preprocess %s: done" % kernel.name)
+    logger.info("%s: preprocess done" % kernel.name)
 
     return kernel
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index c60505204..93565cde1 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -794,10 +794,10 @@ def generate_loop_schedules(kernel, debug_args={}):
     from loopy.preprocess import preprocess_kernel
     kernel = preprocess_kernel(kernel)
 
-    from loopy.check import run_automatic_checks
-    run_automatic_checks(kernel)
+    from loopy.check import pre_schedule_checks
+    pre_schedule_checks(kernel)
 
-    logger.info("schedule %s: start" % kernel.name)
+    logger.info("%s: schedule start" % kernel.name)
 
     schedule_count = 0
 
@@ -874,7 +874,7 @@ def generate_loop_schedules(kernel, debug_args={}):
 
         raise RuntimeError("no valid schedules found")
 
-    logger.info("schedule %s: done" % kernel.name)
+    logger.info("%s: schedule done" % kernel.name)
 
 # }}}
 
diff --git a/loopy/subst.py b/loopy/subst.py
index 2c3669c83..171a2daa6 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -30,6 +30,10 @@ from pytools import Record
 from pymbolic import var
 
 
+import logging
+logger = logging.getLogger(__name__)
+
+
 class ExprDescriptor(Record):
     __slots__ = ["insn", "expr", "unif_var_dict"]
 
@@ -184,12 +188,18 @@ def extract_subst(kernel, subst_name, template, parameters):
 
 
 def expand_subst(kernel, ctx_match=None):
+    logger.debug("%s: expand subst" % kernel.name)
+
     from loopy.symbolic import SubstitutionRuleExpander
     from loopy.context_matching import parse_stack_match
     submap = SubstitutionRuleExpander(kernel.substitutions,
             kernel.get_var_name_generator(),
             parse_stack_match(ctx_match))
 
-    return submap.map_kernel(kernel)
+    kernel = submap.map_kernel(kernel)
+    if ctx_match is None:
+        return kernel.copy(substitutions={})
+    else:
+        return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 2352269d4..81837f98d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -330,8 +330,9 @@ class SubstitutionRuleRenamer(IdentityMapper):
 
 def rename_subst_rules_in_instructions(insns, renames):
     subst_renamer = SubstitutionRuleRenamer(renames)
+
     return [
-            insn.copy(expression=subst_renamer(insn.expression))
+            insn.with_transformed_expressions(subst_renamer)
             for insn in insns]
 
 
@@ -486,14 +487,11 @@ class ExpandingIdentityMapper(IdentityMapper):
 
     def map_kernel(self, kernel):
         new_insns = [
-                insn.copy(
-                    # While subst rules are not allowed in assignees, the mapper
-                    # may perform tasks entirely unrelated to subst rules, so
-                    # we must map assignees, too.
-                    assignee=self(insn.assignee, insn.id),
-
-                    expression=self(insn.expression, insn.id))
+                # While subst rules are not allowed in assignees, the mapper
+                # may perform tasks entirely unrelated to subst rules, so
+                # we must map assignees, too.
 
+                insn.with_transformed_expressions(self, insn.id)
                 for insn in kernel.instructions]
 
         new_substs, renames = self._get_new_substitutions_and_renames()
diff --git a/test/test_dg.py b/test/test_dg.py
index 8de1c8eec..956bee2d7 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -27,11 +27,15 @@ import numpy as np
 import pyopencl as cl
 import loopy as lp
 
+import logging  # noqa
+
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
 
 def test_dg_volume(ctx_factory):
+    #logging.basicConfig(level=logging.DEBUG)
+
     dtype = np.float32
     dtype4 = cl.array.vec.float4
     ctx = ctx_factory()
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 5407fea58..ea222bc82 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -481,7 +481,7 @@ def test_fuzz_code_generator(ctx_factory):
                 return np.float64
 
         knl = lp.make_kernel(ctx.devices[0], "{ : }",
-                [lp.ExpressionInstruction(None, "value", expr)],
+                [lp.ExpressionInstruction("value", expr)],
                 [lp.GlobalArg("value", np.complex128, shape=())]
                 + [
                     lp.ValueArg(name, get_dtype(val))
@@ -1165,6 +1165,32 @@ def test_convolution_like(ctx_factory):
             parameters={"im_w": 1024, "im_h": 1024, "f_w": 7})
 
 
+def test_c_instruction(ctx_factory):
+    logging.basicConfig(level=logging.DEBUG)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(ctx.devices[0], [
+            "{[i,j]: 0<=i,j<n }",
+            ],
+            [
+                lp.CInstruction("i", """
+                    x = sin((float) i);
+                    """, assignees="x"),
+                "a[i*i] = x",
+                ],
+            [
+                lp.GlobalArg("a", shape="n"),
+                lp.ValueArg("n"),
+                lp.TemporaryVariable("x", np.float32),
+                ],
+            assumptions="n>=1")
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    print knl
+    print lp.CompiledKernel(ctx, knl).get_highlighted_code()
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
-- 
GitLab