diff --git a/loopy/__init__.py b/loopy/__init__.py
index 9766aa37047111bddf15b61a1d14d628a6175d04..41ce634c4ced81971010f2bd319bb0ab155ea5e7 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -49,13 +49,18 @@ class LoopyAdvisory(UserWarning):
 
 # {{{ imported user interface
 
-from loopy.kernel import ValueArg, ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg
+from loopy.kernel.data import (
+        ValueArg, ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg,
 
-from loopy.kernel import (AutoFitLocalIndexTag, get_dot_dependency_graph,
-        LoopKernel, Instruction,
         default_function_mangler, single_arg_function_mangler, opencl_function_mangler,
-        default_preamble_generator)
-from loopy.creation import make_kernel
+
+        default_preamble_generator,
+
+        Instruction)
+
+from loopy.kernel import LoopKernel
+from loopy.kernel.tools import get_dot_dependency_graph
+from loopy.kernel.creation import make_kernel
 from loopy.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst
 from loopy.cse import precompute
@@ -137,7 +142,7 @@ def split_iname(kernel, split_iname, inner_length,
     """
 
     existing_tag = kernel.iname_to_tag.get(split_iname)
-    from loopy.kernel import ForceSequentialTag
+    from loopy.kernel.data import ForceSequentialTag
     if do_tagged_check and (
             existing_tag is not None
             and not isinstance(existing_tag, ForceSequentialTag)):
@@ -289,7 +294,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
     if new_iname is None:
         new_iname = kernel.get_var_name_generator()("_and_".join(inames))
 
-    from loopy.kernel import DomainChanger
+    from loopy.kernel.tools import DomainChanger
     domch = DomainChanger(kernel, frozenset(inames))
     for iname in inames:
         if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
@@ -389,12 +394,12 @@ join_dimensions = MovedFunctionDeprecationWrapper(join_inames)
 # {{{ tag inames
 
 def tag_inames(kernel, iname_to_tag, force=False):
-    from loopy.kernel import parse_tag
+    from loopy.kernel.data import parse_tag
 
     iname_to_tag = dict((iname, parse_tag(tag))
             for iname, tag in iname_to_tag.iteritems())
 
-    from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase,
+    from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase,
             ForceSequentialTag)
 
     new_iname_to_tag = kernel.iname_to_tag.copy()
@@ -517,7 +522,7 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
     # {{{ duplicate the inames
 
     for old_iname, new_iname in zip(inames, new_inames):
-        from loopy.kernel import DomainChanger
+        from loopy.kernel.tools import DomainChanger
         domch = DomainChanger(knl, frozenset([old_iname]))
 
         from loopy.isl_helpers import duplicate_axes
@@ -597,7 +602,7 @@ def link_inames(knl, inames, new_iname, within=None, tag=None):
 
     # }}}
 
-    from loopy.kernel import DomainChanger
+    from loopy.kernel.tools import DomainChanger
     domch = DomainChanger(knl, tuple(inames))
 
     # {{{ ensure that projections are identical
@@ -705,7 +710,7 @@ def remove_unused_inames(knl, inames=None):
 
     # {{{ remove them
 
-    from loopy.kernel import DomainChanger
+    from loopy.kernel.tools import DomainChanger
 
     for iname in unused_inames:
         domch = DomainChanger(knl, (iname,))
@@ -727,7 +732,7 @@ def remove_unused_inames(knl, inames=None):
 # {{{ process footprint_subscripts
 
 def _add_kernel_axis(kernel, axis_name, start, stop, base_inames):
-    from loopy.kernel import DomainChanger
+    from loopy.kernel.tools import DomainChanger
     domch = DomainChanger(kernel, base_inames)
 
     domain = domch.domain
diff --git a/loopy/check.py b/loopy/check.py
index 65962a8eb4059d7403d06e32ab2684446743e0b5..093ddfde967e4b44aaa2e68ef8ce6cf15331509f 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -42,7 +42,7 @@ def check_for_unused_hw_axes_in_insns(kernel):
 
     # alternative: just disregard length-1 dimensions?
 
-    from loopy.kernel import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag
+    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag
     for insn in kernel.instructions:
         if insn.boostable:
             continue
@@ -78,7 +78,7 @@ def check_for_unused_hw_axes_in_insns(kernel):
 
 
 def check_for_double_use_of_hw_axes(kernel):
-    from loopy.kernel import UniqueTag
+    from loopy.kernel.data import UniqueTag
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
@@ -117,7 +117,7 @@ class WriteRaceConditionError(RuntimeError):
 
 def check_for_write_races(kernel):
     from loopy.symbolic import DependencyMapper
-    from loopy.kernel import ParallelTag, GroupIndexTag, LocalIndexTagBase
+    from loopy.kernel.data import ParallelTag, GroupIndexTag, LocalIndexTagBase
     depmap = DependencyMapper()
 
     iname_to_tag = kernel.iname_to_tag.get
@@ -186,7 +186,7 @@ def check_for_write_races(kernel):
                     % (insn.id, ",".join(race_inames)))
 
 def check_for_orphaned_user_hardware_axes(kernel):
-    from loopy.kernel import LocalIndexTag
+    from loopy.kernel.data import LocalIndexTag
     for axis in kernel.local_sizes:
         found = False
         for tag in kernel.iname_to_tag.itervalues():
@@ -199,7 +199,7 @@ def check_for_orphaned_user_hardware_axes(kernel):
                     "has no iname mapped to it" % axis)
 
 def check_for_data_dependent_parallel_bounds(kernel):
-    from loopy.kernel import ParallelTag
+    from loopy.kernel.data import ParallelTag
 
     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))
@@ -482,7 +482,7 @@ def get_problems(kernel, parameters):
             msg(4, "using more local memory than available--"
                     "possibly OK due to cache nature")
 
-    from loopy.kernel import ConstantArg
+    from loopy.kernel.data import ConstantArg
     const_arg_count = sum(
             1 for arg in kernel.args
             if isinstance(arg, ConstantArg))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index b69cf501c56bd4ac3af1328d556e797b536eb94a..5b43de7814779f9b02f5fe84517ac3655746899e 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -26,7 +26,6 @@ THE SOFTWARE.
 
 
 from pytools import Record
-import numpy as np
 import islpy as isl
 
 
@@ -175,7 +174,7 @@ def make_initial_assignments(kernel):
 
     global_size, local_size = kernel.get_grid_sizes()
 
-    from loopy.kernel import LocalIndexTag, GroupIndexTag
+    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
     from pymbolic import var
 
     for iname in kernel.all_inames():
@@ -253,7 +252,7 @@ def generate_code(kernel, with_annotation=False,
 
     has_image = False
 
-    from loopy.kernel import GlobalArg, ConstantArg, ImageArg, ValueArg
+    from loopy.kernel.data import GlobalArg, ConstantArg, ImageArg, ValueArg
 
     args = []
     for arg in kernel.args:
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 71cd536cf2ed068d2cb7c25f8bfc7152028565c6..51c0dca36b4159670e0b04ab7cd02f7e5275dcd5 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -75,7 +75,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
 
 def get_usable_inames_for_conditional(kernel, sched_index):
     from loopy.schedule import EnterLoop, LeaveLoop
-    from loopy.kernel import ParallelTag, LocalIndexTagBase
+    from loopy.kernel.data import ParallelTag, LocalIndexTagBase
 
     result = set()
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index a0a2a29b7f8446e36d3d8b4a27a7d61aeb83242e..fa42edf88ab6c42681a3dba494b6d15ed1cfe3e6 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
     inames if there is a barrier nested somewhere within.
     """
 
-    from loopy.kernel import LocalIndexTag, HardwareParallelTag
+    from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
 
     from loopy.schedule import find_active_inames_at, has_barrier_within
     result = find_active_inames_at(kernel, sched_index)
@@ -69,7 +69,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
                 generate_unroll_loop,
                 generate_sequential_loop_dim_code)
 
-        from loopy.kernel import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
+        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
                 LoopedIlpTag)
         if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
             func = generate_unroll_loop
@@ -117,7 +117,7 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames):
 
     tag_key_uses = {}
 
-    from loopy.kernel import HardwareParallelTag
+    from loopy.kernel.data import HardwareParallelTag
 
     for iname in cond_inames:
         tag = kernel.iname_to_tag.get(iname)
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index 5772bf7bcdf01245e496fca28e820c8457dcfc22..902ed7ef2aafc6e94df3dec0103d51fc5685a181 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -319,8 +319,8 @@ class LoopyCCodeMapper(RecursiveMapper):
                     enclosing_prec, type_context))
         elif expr.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.name]
-            from loopy.kernel import _ShapedArg
-            if isinstance(arg, _ShapedArg) and arg.shape == ():
+            from loopy.kernel.data import ShapedArg
+            if isinstance(arg, ShapedArg) and arg.shape == ():
                 return "*"+expr.name
 
         for mangler in self.kernel.symbol_manglers:
@@ -354,7 +354,7 @@ class LoopyCCodeMapper(RecursiveMapper):
         if expr.aggregate.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.aggregate.name]
 
-            from loopy.kernel import ImageArg
+            from loopy.kernel.data import ImageArg
             if isinstance(arg, ImageArg):
                 assert isinstance(expr.index, tuple)
 
@@ -429,7 +429,7 @@ class LoopyCCodeMapper(RecursiveMapper):
         if expr.aggregate.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.aggregate.name]
 
-            from loopy.kernel import ImageArg
+            from loopy.kernel.data import ImageArg
             if isinstance(arg, ImageArg):
                 raise RuntimeError("linear indexing doesn't work on images: %s"
                         % expr)
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 8c4502123bb9398fd65b575f8f782c5e93c72fed..37616322ecdef2c734e8365a49c98d6dd5f81031 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -156,7 +156,7 @@ def intersect_kernel_with_slab(kernel, slab, iname):
 # {{{ hw-parallel loop
 
 def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
-    from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag
+    from loopy.kernel.data import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag
 
     if hw_inames_left is None:
         hw_inames_left = [iname
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 84c525fd79449b1e6ab93791935e4f9c83d7a18f..2e17f4e7408e5da47e1861449c6b41d43acbe26b 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -139,7 +139,7 @@ class CompiledKernel:
     def get_kernel(self, dtype_mapping_set):
         kernel = self.kernel
 
-        from loopy.kernel import (
+        from loopy.kernel.tools import (
                 add_argument_dtypes,
                 infer_argument_dtypes,
                 get_arguments_with_incomplete_dtype)
@@ -185,7 +185,7 @@ class CompiledKernel:
             print "[Loopy] ----------------------------------------------------"
             raise
 
-        from loopy.kernel import ValueArg
+        from loopy.kernel.data import ValueArg
 
         arg_types = []
         for arg in kernel.args:
@@ -374,7 +374,7 @@ class TestArgInfo(Record):
 
 def make_ref_args(kernel, queue, parameters,
         fill_value):
-    from loopy.kernel import ValueArg, GlobalArg, ImageArg
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
 
     from pymbolic import evaluate
 
@@ -468,7 +468,7 @@ def make_ref_args(kernel, queue, parameters,
 
 def make_args(queue, kernel, arg_descriptors, parameters,
         fill_value):
-    from loopy.kernel import ValueArg, GlobalArg, ImageArg
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
 
     from pymbolic import evaluate
 
diff --git a/loopy/creation.py b/loopy/creation.py
deleted file mode 100644
index 325a56f8fd6dcc96e8d9ac89d9c66da2b7e65d68..0000000000000000000000000000000000000000
--- a/loopy/creation.py
+++ /dev/null
@@ -1,314 +0,0 @@
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-
-
-import numpy as np
-from loopy.symbolic import IdentityMapper
-
-
-def tag_reduction_inames_as_sequential(knl):
-    result = set()
-
-    def map_reduction(red_expr, rec):
-        rec(red_expr.expr)
-        result.update(red_expr.inames)
-
-    from loopy.symbolic import ReductionCallbackMapper
-    for insn in knl.instructions:
-        ReductionCallbackMapper(map_reduction)(insn.expression)
-
-    from loopy.kernel import ParallelTag, ForceSequentialTag
-
-    new_iname_to_tag = {}
-    for iname in result:
-        tag = knl.iname_to_tag.get(iname)
-        if tag is not None and isinstance(tag, ParallelTag):
-            raise RuntimeError("inconsistency detected: "
-                    "reduction iname '%s' has "
-                    "a parallel tag" % iname)
-
-        if tag is None:
-            new_iname_to_tag[iname] = ForceSequentialTag()
-
-    from loopy import tag_inames
-    return tag_inames(knl, new_iname_to_tag)
-
-# {{{ sanity checking
-
-def check_for_duplicate_names(knl):
-    name_to_source = {}
-
-    def add_name(name, source):
-        if name in name_to_source:
-            raise RuntimeError("invalid %s name '%s'--name already used as "
-                    "%s" % (source, name, name_to_source[name]))
-
-        name_to_source[name] = source
-
-    for name in knl.all_inames():
-        add_name(name, "iname")
-    for arg in knl.args:
-        add_name(arg.name, "argument")
-    for name in knl.temporary_variables:
-        add_name(name, "temporary")
-    for name in knl.substitutions:
-        add_name(name, "substitution")
-
-def check_for_nonexistent_iname_deps(knl):
-    for insn in knl.instructions:
-        if not set(insn.forced_iname_deps) <= knl.all_inames():
-            raise ValueError("In instruction '%s': "
-                    "cannot force dependency on inames '%s'--"
-                    "they don't exist" % (
-                        insn.id,
-                        ",".join(
-                            set(insn.forced_iname_deps)-knl.all_inames())))
-
-def check_for_multiple_writes_to_loop_bounds(knl):
-    from islpy import dim_type
-
-    domain_parameters = set()
-    for dom in knl.domains:
-        domain_parameters.update(dom.get_space().get_var_dict(dim_type.param))
-
-    temp_var_domain_parameters = domain_parameters & set(
-            knl.temporary_variables)
-
-    wmap = knl.writer_map()
-    for tvpar in temp_var_domain_parameters:
-        par_writers = wmap[tvpar]
-        if len(par_writers) != 1:
-            raise RuntimeError("there must be exactly one write to data-dependent "
-                    "domain parameter '%s' (found %d)" % (tvpar, len(par_writers)))
-
-
-def check_written_variable_names(knl):
-    admissible_vars = (
-            set(arg.name for arg in knl.args)
-            | set(knl.temporary_variables.iterkeys()))
-
-    for insn in knl.instructions:
-        var_name = insn.get_assignee_var_name()
-
-        if var_name not in admissible_vars:
-            raise RuntimeError("variable '%s' not declared or not "
-                    "allowed for writing" % var_name)
-
-# }}}
-
-# {{{ expand common subexpressions into assignments
-
-class CSEToAssignmentMapper(IdentityMapper):
-    def __init__(self, add_assignment):
-        self.add_assignment = add_assignment
-        self.expr_to_var = {}
-
-    def map_common_subexpression(self, expr):
-        try:
-            return self.expr_to_var[expr.child]
-        except KeyError:
-            from loopy.symbolic import TypedCSE
-            if isinstance(expr, TypedCSE):
-                dtype = expr.dtype
-            else:
-                dtype = None
-
-            child = self.rec(expr.child)
-            from pymbolic.primitives import Variable
-            if isinstance(child, Variable):
-                return child
-
-            var_name = self.add_assignment(expr.prefix, child, dtype)
-            var = Variable(var_name)
-            self.expr_to_var[expr.child] = var
-            return var
-
-def expand_cses(knl):
-    def add_assignment(base_name, expr, dtype):
-        if base_name is None:
-            base_name = "var"
-
-        new_var_name = var_name_gen(base_name)
-
-        if dtype is None:
-            from loopy import infer_type
-            dtype = infer_type
-        else:
-            dtype=np.dtype(dtype)
-
-        from loopy.kernel import TemporaryVariable
-        new_temp_vars[new_var_name] = TemporaryVariable(
-                name=new_var_name,
-                dtype=dtype,
-                is_local=None,
-                shape=())
-
-        from pymbolic.primitives import Variable
-        from loopy.kernel import Instruction
-        insn = Instruction(
-                id=knl.make_unique_instruction_id(extra_used_ids=newly_created_insn_ids),
-                assignee=Variable(new_var_name), expression=expr)
-        newly_created_insn_ids.add(insn.id)
-        new_insns.append(insn)
-
-        return new_var_name
-
-    cseam = CSEToAssignmentMapper(add_assignment=add_assignment)
-
-    new_insns = []
-
-    var_name_gen = knl.get_var_name_generator()
-
-    newly_created_insn_ids = set()
-    new_temp_vars = knl.temporary_variables.copy()
-
-    for insn in knl.instructions:
-        new_insns.append(insn.copy(expression=cseam(insn.expression)))
-
-    return knl.copy(
-            instructions=new_insns,
-            temporary_variables=new_temp_vars)
-
-# }}}
-
-# {{{ temporary variable creation
-
-def create_temporaries(knl):
-    new_insns = []
-    new_temp_vars = knl.temporary_variables.copy()
-
-    for insn in knl.instructions:
-        from loopy.kernel import TemporaryVariable
-
-        if insn.temp_var_type is not None:
-            assignee_name = insn.get_assignee_var_name()
-
-            assignee_indices = []
-            from pymbolic.primitives import Variable
-            for index_expr in insn.get_assignee_indices():
-                if (not isinstance(index_expr, Variable)
-                        or not index_expr.name in knl.all_inames()):
-                    raise RuntimeError(
-                            "only plain inames are allowed in "
-                            "the lvalue index when declaring the "
-                            "variable '%s' in an instruction"
-                            % assignee_name)
-
-                assignee_indices.append(index_expr.name)
-
-            base_indices, shape = \
-                    knl.find_var_base_indices_and_shape_from_inames(
-                            assignee_indices, knl.cache_manager)
-
-            if assignee_name in new_temp_vars:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists" % assignee_name)
-            if assignee_name in knl.arg_dict:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists as argument" % assignee_name)
-
-            new_temp_vars[assignee_name] = TemporaryVariable(
-                    name=assignee_name,
-                    dtype=insn.temp_var_type,
-                    is_local=None,
-                    base_indices=base_indices,
-                    shape=shape)
-
-            insn = insn.copy(temp_var_type=None)
-
-        new_insns.append(insn)
-
-    return knl.copy(
-            instructions=new_insns,
-            temporary_variables=new_temp_vars)
-
-# }}}
-
-# {{{ check for reduction iname duplication
-
-def check_for_reduction_inames_duplication_requests(kernel):
-
-    # {{{ helper function
-
-    def check_reduction_inames(reduction_expr, rec):
-        for iname in reduction_expr.inames:
-            if iname.startswith("@"):
-                raise RuntimeError("Reduction iname duplication with '@' is no "
-                        "longer supported. Use loopy.duplicate_inames instead.")
-
-    # }}}
-
-
-    from loopy.symbolic import ReductionCallbackMapper
-    rcm = ReductionCallbackMapper(check_reduction_inames)
-    for insn in kernel.instructions:
-        rcm(insn.expression)
-
-    for sub_name, sub_rule in kernel.substitutions.iteritems():
-        rcm(sub_rule.expression)
-
-# }}}
-
-# {{{ kernel creation top-level
-
-def make_kernel(*args, **kwargs):
-    """Second pass of kernel creation. Think about requests for iname duplication
-    and temporary variable creation.
-    """
-
-    from loopy.kernel import LoopKernel
-    knl = LoopKernel(*args, **kwargs)
-
-    from loopy import tag_inames
-    knl = tag_inames(
-            knl.copy(iname_to_tag_requests=None),
-            knl.iname_to_tag_requests).copy(
-                    iname_to_tag_requests=[])
-
-    check_for_nonexistent_iname_deps(knl)
-    check_for_reduction_inames_duplication_requests(knl)
-
-
-    knl = tag_reduction_inames_as_sequential(knl)
-    knl = create_temporaries(knl)
-    knl = expand_cses(knl)
-
-    # -------------------------------------------------------------------------
-    # Ordering dependency:
-    # -------------------------------------------------------------------------
-    # Must create temporary before checking for writes to temporary variables
-    # that are domain parameters.
-    # -------------------------------------------------------------------------
-
-    check_for_multiple_writes_to_loop_bounds(knl)
-    check_for_duplicate_names(knl)
-    check_written_variable_names(knl)
-
-    return knl
-
-# }}}
-
-# vim: fdm=marker
diff --git a/loopy/cse.py b/loopy/cse.py
index fd0d150de7133ee7117679168b5997d46058fa43..77594b4124ef56984db87c78abc287ab3a598f87 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -674,7 +674,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     c_subst_name = subst_name.replace(".", "_")
 
-    from loopy.kernel import parse_tag
+    from loopy.kernel.data import parse_tag
     default_tag = parse_tag(default_tag)
 
     subst = kernel.substitutions[subst_name]
@@ -786,7 +786,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # {{{ find domain to be changed
 
-    from loopy.kernel import DomainChanger
+    from loopy.kernel.tools import DomainChanger
     domch = DomainChanger(kernel, expanding_inames)
 
     if domch.leaf_domain_index is not None:
@@ -836,7 +836,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             )))
         (compute_expr))
 
-    from loopy.kernel import Instruction
+    from loopy.kernel.data import Instruction
     compute_insn = Instruction(
             id=kernel.make_unique_instruction_id(based_on=c_subst_name),
             assignee=assignee,
@@ -864,7 +864,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     else:
         dtype = np.dtype(dtype)
 
-    from loopy.kernel import TemporaryVariable
+    from loopy.kernel.data import TemporaryVariable
 
     new_temporary_variables = kernel.temporary_variables.copy()
     temp_var = TemporaryVariable(
diff --git a/loopy/kernel.py b/loopy/kernel.py
deleted file mode 100644
index f0c799e5d38fe36af0ab88855de79d7b4865c3c2..0000000000000000000000000000000000000000
--- a/loopy/kernel.py
+++ /dev/null
@@ -1,1973 +0,0 @@
-"""Elements of loopy's user-facing language."""
-
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-
-
-
-import numpy as np
-from pytools import Record, memoize_method
-import islpy as isl
-from islpy import dim_type
-
-import re
-
-
-
-
-class CannotBranchDomainTree(RuntimeError):
-    pass
-
-# {{{ index tags
-
-class IndexTag(Record):
-    __slots__ = []
-
-    def __hash__(self):
-        raise RuntimeError("use .key to hash index tags")
-
-
-
-
-class ParallelTag(IndexTag):
-    pass
-
-class HardwareParallelTag(ParallelTag):
-    pass
-
-class UniqueTag(IndexTag):
-    @property
-    def key(self):
-        return type(self)
-
-class AxisTag(UniqueTag):
-    __slots__ = ["axis"]
-
-    def __init__(self, axis):
-        Record.__init__(self,
-                axis=axis)
-
-    @property
-    def key(self):
-        return (type(self), self.axis)
-
-    def __str__(self):
-        return "%s.%d" % (
-                self.print_name, self.axis)
-
-class GroupIndexTag(HardwareParallelTag, AxisTag):
-    print_name = "g"
-
-class LocalIndexTagBase(HardwareParallelTag):
-    pass
-
-class LocalIndexTag(LocalIndexTagBase, AxisTag):
-    print_name = "l"
-
-class AutoLocalIndexTagBase(LocalIndexTagBase):
-    pass
-
-class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
-    def __str__(self):
-        return "l.auto"
-
-class IlpBaseTag(ParallelTag):
-    pass
-
-class UnrolledIlpTag(IlpBaseTag):
-    def __str__(self):
-        return "ilp.unr"
-
-class LoopedIlpTag(IlpBaseTag):
-    def __str__(self):
-        return "ilp.seq"
-
-class UnrollTag(IndexTag):
-    def __str__(self):
-        return "unr"
-
-class ForceSequentialTag(IndexTag):
-    def __str__(self):
-        return "forceseq"
-
-def parse_tag(tag):
-    if tag is None:
-        return tag
-
-    if isinstance(tag, IndexTag):
-        return tag
-
-    if not isinstance(tag, str):
-        raise ValueError("cannot parse tag: %s" % tag)
-
-    if tag == "for":
-        return None
-    elif tag in ["unr"]:
-        return UnrollTag()
-    elif tag in ["ilp", "ilp.unr"]:
-        return UnrolledIlpTag()
-    elif tag == "ilp.seq":
-        return LoopedIlpTag()
-    elif tag.startswith("g."):
-        return GroupIndexTag(int(tag[2:]))
-    elif tag.startswith("l."):
-        axis = tag[2:]
-        if axis == "auto":
-            return AutoFitLocalIndexTag()
-        else:
-            return LocalIndexTag(int(axis))
-    else:
-        raise ValueError("cannot parse tag: %s" % tag)
-
-# }}}
-
-# {{{ arguments
-
-class _ShapedArg(Record):
-    def __init__(self, name, dtype=None, shape=None, strides=None, order="C",
-            offset=0):
-        """
-        All of the following are optional. Specify either strides or shape.
-
-        :arg shape:
-        :arg strides: like numpy strides, but in multiples of
-            data type size
-        :arg order:
-        :arg offset: Offset from the beginning of the vector from which
-            the strides are counted.
-        """
-        if dtype is not None:
-            dtype = np.dtype(dtype)
-
-        def parse_if_necessary(x):
-            if isinstance(x, str):
-                from pymbolic import parse
-                return parse(x)
-            else:
-                return x
-
-        def process_tuple(x):
-            x = parse_if_necessary(x)
-            if not isinstance(x, tuple):
-                x = (x,)
-
-            return tuple(parse_if_necessary(xi) for xi in x)
-
-        if strides is not None:
-            strides = process_tuple(strides)
-
-        if shape is not None:
-            shape = process_tuple(shape)
-
-        if strides is None and shape is not None:
-            from pyopencl.compyte.array import (
-                    f_contiguous_strides,
-                    c_contiguous_strides)
-
-            if order == "F":
-                strides = f_contiguous_strides(1, shape)
-            elif order == "C":
-                strides = c_contiguous_strides(1, shape)
-            else:
-                raise ValueError("invalid order: %s" % order)
-
-        Record.__init__(self,
-                name=name,
-                dtype=dtype,
-                strides=strides,
-                offset=offset,
-                shape=shape)
-
-    @property
-    @memoize_method
-    def numpy_strides(self):
-        return tuple(self.dtype.itemsize*s for s in self.strides)
-
-    @property
-    def dimensions(self):
-        return len(self.shape)
-
-class GlobalArg(_ShapedArg):
-    def __repr__(self):
-        return "<GlobalArg '%s' of type %s and shape (%s)>" % (
-                self.name, self.dtype, ",".join(str(i) for i in self.shape))
-
-class ArrayArg(GlobalArg):
-    def __init__(self, *args, **kwargs):
-        from warnings import warn
-        warn("ArrayArg is a deprecated name of GlobalArg", DeprecationWarning,
-                stacklevel=2)
-        GlobalArg.__init__(self, *args, **kwargs)
-
-class ConstantArg(_ShapedArg):
-    def __repr__(self):
-        return "<ConstantArg '%s' of type %s and shape (%s)>" % (
-                self.name, self.dtype, ",".join(str(i) for i in self.shape))
-
-class ImageArg(Record):
-    def __init__(self, name, dtype=None, dimensions=None, shape=None):
-        dtype = np.dtype(dtype)
-        if shape is not None:
-            if dimensions is not None and dimensions != len(shape):
-                raise RuntimeError("cannot specify both shape and "
-                        "disagreeing dimensions in ImageArg")
-            dimensions = len(shape)
-        else:
-            if not isinstance(dimensions, int):
-                raise RuntimeError("ImageArg: dimensions must be an integer")
-
-        Record.__init__(self,
-                dimensions=dimensions,
-                shape=shape,
-                dtype=dtype,
-                name=name)
-
-
-    def __repr__(self):
-        return "<ImageArg '%s' of type %s>" % (self.name, self.dtype)
-
-class ValueArg(Record):
-    def __init__(self, name, dtype=None, approximately=None):
-        if dtype is not None:
-            dtype = np.dtype(dtype)
-
-        Record.__init__(self, name=name, dtype=dtype,
-                approximately=approximately)
-
-    def __repr__(self):
-        return "<ValueArg '%s' of type %s>" % (self.name, self.dtype)
-
-class ScalarArg(ValueArg):
-    def __init__(self, name, dtype=None, approximately=None):
-        from warnings import warn
-        warn("ScalarArg is a deprecated name of ValueArg",
-                DeprecationWarning, stacklevel=2)
-
-        ValueArg.__init__(self, name, dtype, approximately)
-
-# }}}
-
-# {{{ temporary variable
-
-class TemporaryVariable(Record):
-    """
-    :ivar name:
-    :ivar dtype:
-    :ivar shape:
-    :ivar storage_shape:
-    :ivar base_indices:
-    :ivar is_local:
-    """
-
-    def __init__(self, name, dtype, shape, is_local, base_indices=None,
-            storage_shape=None):
-        if base_indices is None:
-            base_indices = (0,) * len(shape)
-
-        if shape is not None and not isinstance(shape, tuple):
-            shape = tuple(shape)
-
-        Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local,
-                base_indices=base_indices,
-                storage_shape=storage_shape)
-
-    @property
-    def nbytes(self):
-        from pytools import product
-        return product(si for si in self.shape)*self.dtype.itemsize
-
-# }}}
-
-# {{{ subsitution rule
-
-class SubstitutionRule(Record):
-    """
-    :ivar name:
-    :ivar arguments:
-    :ivar expression:
-    """
-
-    def __init__(self, name, arguments, expression):
-        assert isinstance(arguments, tuple)
-
-        Record.__init__(self,
-                name=name, arguments=arguments, expression=expression)
-
-    def __str__(self):
-        return "%s(%s) := %s" % (
-                self.name, ", ".join(self.arguments), self.expression)
-
-# }}}
-
-# {{{ instruction
-
-class Instruction(Record):
-    """
-    :ivar id: An (otherwise meaningless) identifier that is unique within
-        a :class:`LoopKernel`.
-    :ivar assignee:
-    :ivar expression:
-    :ivar forced_iname_deps: a set of inames that are added to the list of iname
-        dependencies
-    :ivar insn_deps: a list of ids of :class:`Instruction` instances that
-        *must* be executed before this one. Note that loop scheduling augments this
-        by adding dependencies on any writes to temporaries read by this instruction.
-    :ivar boostable: Whether the instruction may safely be executed
-        inside more loops than advertised without changing the meaning
-        of the program. Allowed values are *None* (for unknown), *True*, and *False*.
-    :ivar boostable_into: a set of inames into which the instruction
-        may need to be boosted, as a heuristic help for the scheduler.
-    :ivar priority: scheduling priority
-
-    The following two instance variables are only used until :func:`loopy.make_kernel` is
-    finished:
-
-    :ivar temp_var_type: if not None, a type that will be assigned to the new temporary variable
-        created from the assignee
-    """
-    def __init__(self,
-            id, assignee, expression,
-            forced_iname_deps=frozenset(), insn_deps=set(), boostable=None,
-            boostable_into=None,
-            temp_var_type=None, priority=0):
-
-        from loopy.symbolic import parse
-        if isinstance(assignee, str):
-            assignee = parse(assignee)
-        if isinstance(expression, str):
-            assignee = parse(expression)
-
-        assert isinstance(forced_iname_deps, frozenset)
-        assert isinstance(insn_deps, set)
-
-        Record.__init__(self,
-                id=id, assignee=assignee, expression=expression,
-                forced_iname_deps=forced_iname_deps,
-                insn_deps=insn_deps, boostable=boostable,
-                boostable_into=boostable_into,
-                temp_var_type=temp_var_type,
-                priority=priority)
-
-    @memoize_method
-    def reduction_inames(self):
-        def map_reduction(expr, rec):
-            rec(expr.expr)
-            for iname in expr.inames:
-                result.add(iname)
-
-        from loopy.symbolic import ReductionCallbackMapper
-        cb_mapper = ReductionCallbackMapper(map_reduction)
-
-        result = set()
-        cb_mapper(self.expression)
-
-        return result
-
-    def __str__(self):
-        result = "%s: %s <- %s" % (self.id,
-                self.assignee, self.expression)
-
-        if self.boostable == True:
-            if self.boostable_into:
-                result += " (boostable into '%s')" % ",".join(self.boostable_into)
-            else:
-                result += " (boostable)"
-        elif self.boostable == False:
-            result += " (not boostable)"
-        elif self.boostable is None:
-            pass
-        else:
-            raise RuntimeError("unexpected value for Instruction.boostable")
-
-        options = []
-
-        if self.insn_deps:
-            options.append("deps="+":".join(self.insn_deps))
-        if self.priority:
-            options.append("priority=%d" % self.priority)
-
-        return result
-
-    @memoize_method
-    def get_assignee_var_name(self):
-        from pymbolic.primitives import Variable, Subscript
-
-        if isinstance(self.assignee, Variable):
-            var_name = self.assignee.name
-        elif isinstance(self.assignee, Subscript):
-            agg = self.assignee.aggregate
-            assert isinstance(agg, Variable)
-            var_name = agg.name
-        else:
-            raise RuntimeError("invalid lvalue '%s'" % self.assignee)
-
-        return var_name
-
-    @memoize_method
-    def get_assignee_indices(self):
-        from pymbolic.primitives import Variable, Subscript
-
-        if isinstance(self.assignee, Variable):
-            return ()
-        elif isinstance(self.assignee, Subscript):
-            result = self.assignee.index
-            if not isinstance(result, tuple):
-                result = (result,)
-            return result
-        else:
-            raise RuntimeError("invalid lvalue '%s'" % self.assignee)
-
-    @memoize_method
-    def get_read_var_names(self):
-        from loopy.symbolic import get_dependencies
-        return get_dependencies(self.expression)
-
-# }}}
-
-# {{{ expand defines
-
-WORD_RE = re.compile(r"\b([a-zA-Z0-9_]+)\b")
-BRACE_RE = re.compile(r"\$\{([a-zA-Z0-9_]+)\}")
-
-def expand_defines(insn, defines, single_valued=True):
-    replacements = [()]
-
-    for find_regexp, replace_pattern in [
-            (BRACE_RE, r"\$\{%s\}"),
-            (WORD_RE, r"\b%s\b"),
-            ]:
-
-        for match in find_regexp.finditer(insn):
-            word = match.group(1)
-
-            try:
-                value = defines[word]
-            except KeyError:
-                continue
-
-            if isinstance(value, list):
-                if single_valued:
-                    raise ValueError("multi-valued macro expansion not allowed "
-                            "in this context (when expanding '%s')" % word)
-
-                replacements = [
-                        rep+((replace_pattern % word, subval),)
-                        for rep in replacements
-                        for subval in value
-                        ]
-            else:
-                replacements = [
-                        rep+((replace_pattern % word, value),)
-                        for rep in replacements]
-
-    for rep in replacements:
-        rep_value = insn
-        for pattern, val in rep:
-            rep_value = re.sub(pattern, str(val), rep_value)
-
-        yield rep_value
-
-def expand_defines_in_expr(expr, defines):
-    from pymbolic.primitives import Variable
-    from loopy.symbolic import parse
-
-    def subst_func(var):
-        if isinstance(var, Variable):
-            try:
-                var_value = defines[var.name]
-            except KeyError:
-                return None
-            else:
-                return parse(str(var_value))
-        else:
-            return None
-
-    from loopy.symbolic import SubstitutionMapper
-    return SubstitutionMapper(subst_func)(expr)
-
-# }}}
-
-# {{{ function manglers / dtype getters
-
-def default_function_mangler(name, arg_dtypes):
-    from loopy.reduction import reduction_function_mangler
-
-    manglers = [reduction_function_mangler]
-    for mangler in manglers:
-        result = mangler(name, arg_dtypes)
-        if result is not None:
-            return result
-
-    return None
-
-def opencl_function_mangler(name, arg_dtypes):
-    if name == "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
-
-    if len(arg_dtypes) == 1:
-        arg_dtype, = arg_dtypes
-
-        if arg_dtype.kind == "c":
-            if arg_dtype == np.complex64:
-                tpname = "cfloat"
-            elif arg_dtype == np.complex128:
-                tpname = "cdouble"
-            else:
-                raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
-
-            if name in ["sqrt", "exp", "log",
-                    "sin", "cos", "tan",
-                    "sinh", "cosh", "tanh"]:
-                return arg_dtype, "%s_%s" % (tpname, name)
-
-            if name in ["real", "imag"]:
-                return np.dtype(arg_dtype.type(0).real), "%s_%s" % (tpname, name)
-
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].fields["s0"]
-        return scalar_dtype, name
-
-    return None
-
-def single_arg_function_mangler(name, arg_dtypes):
-    if len(arg_dtypes) == 1:
-        dtype, = arg_dtypes
-        return dtype, name
-
-    return None
-
-def opencl_symbol_mangler(name):
-    # FIXME: should be more picky about exact names
-    if name.startswith("FLT_"):
-        return np.dtype(np.float32), name
-    elif name.startswith("DBL_"):
-        return np.dtype(np.float64), name
-    elif name.startswith("M_"):
-        if name.endswith("_F"):
-            return np.dtype(np.float32), name
-        else:
-            return np.dtype(np.float64), name
-    else:
-        return None
-
-# }}}
-
-# {{{ preamble generators
-
-def default_preamble_generator(seen_dtypes, seen_functions):
-    from loopy.reduction import reduction_preamble_generator
-
-    for result in reduction_preamble_generator(seen_dtypes, seen_functions):
-        yield result
-
-    has_double = False
-    has_complex = False
-
-    for dtype in seen_dtypes:
-        if dtype in [np.float64, np.complex128]:
-            has_double = True
-        if dtype.kind == "c":
-            has_complex = True
-
-    if has_double:
-        yield ("00_enable_double", """
-            #pragma OPENCL EXTENSION cl_khr_fp64: enable
-            """)
-
-    if has_complex:
-        if has_double:
-            yield ("10_include_complex_header", """
-                #define PYOPENCL_DEFINE_CDOUBLE
-
-                #include <pyopencl-complex.h>
-                """)
-        else:
-            yield ("10_include_complex_header", """
-                #include <pyopencl-complex.h>
-                """)
-
-    c_funcs = set(c_name for name, c_name, arg_dtypes in seen_functions)
-    if "int_floor_div" in c_funcs:
-        yield ("05_int_floor_div", """
-            #define int_floor_div(a,b) \
-              (( (a) - \
-                 ( ( (a)<0 ) != ( (b)<0 )) \
-                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
-               / (b) )
-            """)
-
-    if "int_floor_div_pos_b" in c_funcs:
-        yield ("05_int_floor_div_pos_b", """
-            #define int_floor_div_pos_b(a,b) ( \
-                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
-                )
-            """)
-
-
-# }}}
-
-# {{{ loop kernel object
-
-def _generate_unique_possibilities(prefix):
-    yield prefix
-
-    try_num = 0
-    while True:
-        yield "%s_%d" % (prefix, try_num)
-        try_num += 1
-
-class _UniqueNameGenerator:
-    def __init__(self, existing_names):
-        self.existing_names = existing_names.copy()
-
-    def is_name_conflicting(self, name):
-        return name in self.existing_names
-
-    def add_name(self, name):
-        if self.is_name_conflicting(name):
-            raise ValueError("name '%s' conflicts with existing names")
-        self.existing_names.add(name)
-
-    def add_names(self, names):
-        for name in names:
-            self.add_name(name)
-
-    def __call__(self, based_on="var"):
-        for var_name in _generate_unique_possibilities(based_on):
-            if not self.is_name_conflicting(var_name):
-                break
-
-        self.existing_names.add(var_name)
-        return var_name
-
-_IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")
-
-def _gather_identifiers(s):
-    return set(_IDENTIFIER_RE.findall(s))
-
-def _parse_domains(ctx, args_and_vars, domains, defines):
-    result = []
-    available_parameters = args_and_vars.copy()
-    used_inames = set()
-
-    for dom in domains:
-        if isinstance(dom, str):
-            dom, = expand_defines(dom, defines)
-
-            if not dom.lstrip().startswith("["):
-                # i.e. if no parameters are already given
-                ids = _gather_identifiers(dom)
-                parameters = ids & available_parameters
-                dom = "[%s] -> %s" % (",".join(parameters), dom)
-
-            try:
-                dom = isl.BasicSet.read_from_str(ctx, dom)
-            except:
-                print "failed to parse domain '%s'" % dom
-                raise
-        else:
-            assert isinstance(dom, (isl.Set, isl.BasicSet))
-            # assert dom.get_ctx() == ctx
-
-        for i_iname in xrange(dom.dim(dim_type.set)):
-            iname = dom.get_dim_name(dim_type.set, i_iname)
-
-            if iname is None:
-                raise RuntimeError("domain '%s' provided no iname at index "
-                        "%d (redefined iname?)" % (dom, i_iname))
-
-            if iname in used_inames:
-                raise RuntimeError("domain '%s' redefines iname '%s' "
-                        "that is part of a previous domain" % (dom, iname))
-
-            used_inames.add(iname)
-            available_parameters.add(iname)
-
-        result.append(dom)
-
-    return result
-
-
-
-
-class LoopKernel(Record):
-    """
-    :ivar device: :class:`pyopencl.Device`
-    :ivar domains: :class:`islpy.BasicSet`
-    :ivar instructions:
-    :ivar args:
-    :ivar schedule:
-    :ivar name:
-    :ivar preambles: a list of (tag, code) tuples that identify preamble snippets.
-        Each tag's snippet is only included once, at its first occurrence.
-        The preambles will be inserted in order of their tags.
-    :ivar preamble_generators: a list of functions of signature
-        (seen_dtypes, seen_functions) where seen_functions is a set of
-        (name, c_name, arg_dtypes), generating extra entries for `preambles`.
-    :ivar assumptions: the initial implemented_domain, captures assumptions
-        on the parameters. (an isl.Set)
-    :ivar local_sizes: A dictionary from integers to integers, mapping
-        workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
-        length 16.
-    :ivar temporary_variables:
-    :ivar iname_to_tag:
-    :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
-        objects
-    :ivar function_manglers: list of functions of signature (name, arg_dtypes)
-        returning a tuple (result_dtype, c_name)
-        or a tuple (result_dtype, c_name, arg_dtypes),
-        where c_name is the C-level function to be called.
-    :ivar symbol_manglers: list of functions of signature (name) returning
-        a tuple (result_dtype, c_name), where c_name is the C-level symbol to be
-        evaluated.
-    :ivar defines: a dictionary of replacements to be made in instructions given
-        as strings before parsing. A macro instance intended to be replaced should
-        look like "MACRO" in the instruction code. The expansion given in this
-        parameter is allowed to be a list. In this case, instructions are generated
-        for *each* combination of macro values.
-
-        These defines may also be used in the domain and in argument shapes and
-        strides. They are expanded only upon kernel creation.
-
-    The following arguments are not user-facing:
-
-    :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr,
-        upper_incr) tuples that will be separated out in the execution to generate
-        'bulk' slabs with fewer conditionals.
-    :ivar applied_iname_rewrites: A list of past substitution dictionaries that
-        were applied to the kernel. These are stored so that they may be repeated
-        on expressions the user specifies later.
-    :ivar cache_manager:
-    :ivar isl_context:
-
-    The following instance variables are only used until :func:`loopy.make_kernel` is
-    finished:
-
-    :ivar iname_to_tag_requests:
-    """
-
-    # {{{ constructor
-
-    def __init__(self, device, domains, instructions, args=[], schedule=None,
-            name="loopy_kernel",
-            preambles=[],
-            preamble_generators=[default_preamble_generator],
-            assumptions=None,
-            local_sizes={},
-            temporary_variables={},
-            iname_to_tag={},
-            substitutions={},
-            function_manglers=[
-                default_function_mangler,
-                opencl_function_mangler,
-                single_arg_function_mangler,
-                ],
-            symbol_manglers=[opencl_symbol_mangler],
-            defines={},
-
-            # non-user-facing
-            iname_slab_increments={},
-            applied_iname_rewrites=[],
-            cache_manager=None,
-            iname_to_tag_requests=None,
-            index_dtype=np.int32,
-            isl_context=None,
-
-            # When kernels get intersected in slab decomposition,
-            # their grid sizes shouldn't change. This provides
-            # a way to forward sub-kernel grid size requests.
-            get_grid_sizes=None):
-        """
-        :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
-            Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
-        """
-        assert not iname_to_tag_requests
-
-        import re
-
-        if cache_manager is None:
-            cache_manager = SetOperationCacheManager()
-
-        iname_to_tag_requests = {}
-
-        # {{{ parse instructions
-
-        INSN_RE = re.compile(
-                "\s*(?:\<(?P<temp_var_type>.*?)\>)?"
-                "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"
-                "\s*?(?:\{(?P<options>[\s\w=,:]+)\}\s*)?$"
-                )
-        SUBST_RE = re.compile(
-                r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$"
-                )
-
-        def parse_insn(insn):
-            insn_match = INSN_RE.match(insn)
-            subst_match = SUBST_RE.match(insn)
-            if insn_match is not None and subst_match is not None:
-                raise RuntimeError("instruction parse error: %s" % insn)
-
-            if insn_match is not None:
-                groups = insn_match.groupdict()
-            elif subst_match is not None:
-                groups = subst_match.groupdict()
-            else:
-                raise RuntimeError("insn parse error")
-
-            from loopy.symbolic import parse
-            lhs = parse(groups["lhs"])
-            rhs = parse(groups["rhs"])
-
-            if insn_match is not None:
-                insn_deps = set()
-                insn_id = "insn"
-                priority = 0
-
-                if groups["options"] is not None:
-                    for option in groups["options"].split(","):
-                        option = option.strip()
-                        if not option:
-                            raise RuntimeError("empty option supplied")
-
-                        equal_idx = option.find("=")
-                        if equal_idx == -1:
-                            opt_key = option
-                            opt_value = None
-                        else:
-                            opt_key = option[:equal_idx].strip()
-                            opt_value = option[equal_idx+1:].strip()
-
-                        if opt_key == "id":
-                            insn_id = opt_value
-                        elif opt_key == "priority":
-                            priority = int(opt_value)
-                        elif opt_key == "dep":
-                            insn_deps = set(opt_value.split(":"))
-                        else:
-                            raise ValueError("unrecognized instruction option '%s'"
-                                    % opt_key)
-
-                if groups["temp_var_type"] is not None:
-                    if groups["temp_var_type"]:
-                        temp_var_type = np.dtype(groups["temp_var_type"])
-                    else:
-                        from loopy import infer_type
-                        temp_var_type = infer_type
-                else:
-                    temp_var_type = None
-
-                from pymbolic.primitives import Variable, Subscript
-                if not isinstance(lhs, (Variable, Subscript)):
-                    raise RuntimeError("left hand side of assignment '%s' must "
-                            "be variable or subscript" % lhs)
-
-                parsed_instructions.append(
-                        Instruction(
-                            id=self.make_unique_instruction_id(
-                                parsed_instructions, based_on=insn_id),
-                            insn_deps=insn_deps,
-                            forced_iname_deps=frozenset(),
-                            assignee=lhs, expression=rhs,
-                            temp_var_type=temp_var_type,
-                            priority=priority))
-
-            elif subst_match is not None:
-                from pymbolic.primitives import Variable, Call
-
-                if isinstance(lhs, Variable):
-                    subst_name = lhs.name
-                    arg_names = []
-                elif isinstance(lhs, Call):
-                    if not isinstance(lhs.function, Variable):
-                        raise RuntimeError("Invalid substitution rule left-hand side")
-                    subst_name = lhs.function.name
-                    arg_names = []
-
-                    for i, arg in enumerate(lhs.parameters):
-                        if not isinstance(arg, Variable):
-                            raise RuntimeError("Invalid substitution rule "
-                                            "left-hand side: %s--arg number %d "
-                                            "is not a variable"% (lhs, i))
-                        arg_names.append(arg.name)
-                else:
-                    raise RuntimeError("Invalid substitution rule left-hand side")
-
-                substitutions[subst_name] = SubstitutionRule(
-                        name=subst_name,
-                        arguments=tuple(arg_names),
-                        expression=rhs)
-
-        def parse_if_necessary(insn):
-            if isinstance(insn, Instruction):
-                if insn.id is None:
-                    insn = insn.copy(id=self.make_unique_instruction_id(parsed_instructions))
-                parsed_instructions.append(insn)
-                return
-
-            if not isinstance(insn, str):
-                raise TypeError("Instructions must be either an Instruction "
-                        "instance or a parseable string. got '%s' instead."
-                        % type(insn))
-
-            for insn in insn.split("\n"):
-                comment_start = insn.find("#")
-                if comment_start >= 0:
-                    insn = insn[:comment_start]
-
-                insn = insn.strip()
-                if not insn:
-                    continue
-
-                for sub_insn in expand_defines(insn, defines, single_valued=False):
-                    parse_insn(sub_insn)
-
-        parsed_instructions = []
-
-        substitutions = substitutions.copy()
-
-        if isinstance(instructions, str):
-            instructions = [instructions]
-        for insn in instructions:
-            # must construct list one-by-one to facilitate unique id generation
-            parse_if_necessary(insn)
-
-        if len(set(insn.id for insn in parsed_instructions)) != len(parsed_instructions):
-            raise RuntimeError("instruction ids do not appear to be unique")
-
-        # }}}
-
-        # Ordering dependency:
-        # Domain construction needs to know what temporary variables are
-        # available. That information can only be obtained once instructions
-        # are parsed.
-
-        # {{{ construct domains
-
-        if isinstance(domains, str):
-            domains = [domains]
-
-        for domain in domains:
-            if isinstance(domain, isl.BasicSet):
-                isl_context = domain.get_ctx()
-        if isl_context is None:
-            isl_context = isl.Context()
-
-        scalar_arg_names = set(arg.name for arg in args if isinstance(arg, ValueArg))
-        var_names = (
-                set(temporary_variables)
-                | set(insn.get_assignee_var_name()
-                    for insn in parsed_instructions
-                    if insn.temp_var_type is not None))
-        domains = _parse_domains(isl_context, scalar_arg_names | var_names, domains,
-                defines)
-
-        # }}}
-
-        # {{{ process assumptions
-
-        if assumptions is None:
-            dom0_space = domains[0].get_space()
-            assumptions_space = isl.Space.params_alloc(
-                    dom0_space.get_ctx(), dom0_space.dim(dim_type.param))
-            for i in xrange(dom0_space.dim(dim_type.param)):
-                assumptions_space = assumptions_space.set_dim_name(
-                        dim_type.param, i, dom0_space.get_dim_name(dim_type.param, i))
-            assumptions = isl.BasicSet.universe(assumptions_space)
-
-        elif isinstance(assumptions, str):
-            all_inames = set()
-            all_params = set()
-            for dom in domains:
-                all_inames.update(dom.get_var_names(dim_type.set))
-                all_params.update(dom.get_var_names(dim_type.param))
-
-            domain_parameters = all_params-all_inames
-
-            assumptions_set_str = "[%s] -> { : %s}" \
-                    % (",".join(s for s in domain_parameters),
-                        assumptions)
-            assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(),
-                    assumptions_set_str)
-
-        assert assumptions.is_params()
-
-        # }}}
-
-        # {{{ expand macros in arg shapes
-
-        processed_args = []
-        for arg in args:
-            for arg_name in arg.name.split(","):
-                new_arg = arg.copy(name=arg_name)
-                if isinstance(arg, _ShapedArg):
-                    if arg.shape is not None:
-                        new_arg = new_arg.copy(shape=expand_defines_in_expr(arg.shape, defines))
-                    if arg.strides is not None:
-                        new_arg = new_arg.copy(strides=expand_defines_in_expr(arg.strides, defines))
-
-                processed_args.append(new_arg)
-
-        # }}}
-
-        index_dtype = np.dtype(index_dtype)
-        if index_dtype.kind != 'i':
-            raise TypeError("index_dtype must be an integer")
-        if np.iinfo(index_dtype).min >= 0:
-            raise TypeError("index_dtype must be signed")
-
-        if get_grid_sizes is not None:
-            # overwrites method down below
-            self.get_grid_sizes = get_grid_sizes
-
-        Record.__init__(self,
-                device=device, domains=domains,
-                instructions=parsed_instructions,
-                args=processed_args,
-                schedule=schedule,
-                name=name,
-                preambles=preambles,
-                preamble_generators=preamble_generators,
-                assumptions=assumptions,
-                iname_slab_increments=iname_slab_increments,
-                temporary_variables=temporary_variables,
-                local_sizes=local_sizes,
-                iname_to_tag=iname_to_tag,
-                iname_to_tag_requests=iname_to_tag_requests,
-                substitutions=substitutions,
-                cache_manager=cache_manager,
-                applied_iname_rewrites=applied_iname_rewrites,
-                function_manglers=function_manglers,
-                symbol_manglers=symbol_manglers,
-                index_dtype=index_dtype,
-                isl_context=isl_context)
-
-    # }}}
-
-    # {{{ function mangling
-
-    def register_function_mangler(self, mangler):
-        return self.copy(
-                function_manglers=[mangler]+self.function_manglers)
-
-    def mangle_function(self, identifier, arg_dtypes):
-        for mangler in self.function_manglers:
-            mangle_result = mangler(identifier, arg_dtypes)
-            if mangle_result is not None:
-                return mangle_result
-
-        return None
-
-    # }}}
-
-    # {{{ name wrangling
-
-    @memoize_method
-    def non_iname_variable_names(self):
-        return (set(self.arg_dict.iterkeys())
-                | set(self.temporary_variables.iterkeys()))
-
-    @memoize_method
-    def all_variable_names(self):
-        return (
-                set(self.temporary_variables.iterkeys())
-                | set(self.substitutions.iterkeys())
-                | set(arg.name for arg in self.args)
-                | set(self.all_inames()))
-
-    def get_var_name_generator(self):
-        return _UniqueNameGenerator(self.all_variable_names())
-
-    def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
-        if insns is None:
-            insns = self.instructions
-
-        used_ids = set(insn.id for insn in insns) | extra_used_ids
-
-        for id_str in _generate_unique_possibilities(based_on):
-            if id_str not in used_ids:
-                return id_str
-
-    def get_var_descriptor(self, name):
-        try:
-            return self.arg_dict[name]
-        except KeyError:
-            pass
-
-        try:
-            return self.temporary_variables[name]
-        except KeyError:
-            pass
-
-        raise ValueError("nothing known about variable '%s'" % name)
-
-    @property
-    @memoize_method
-    def id_to_insn(self):
-        return dict((insn.id, insn) for insn in self.instructions)
-
-    # }}}
-
-    # {{{ domain wrangling
-
-    @memoize_method
-    def parents_per_domain(self):
-        """Return a list corresponding to self.domains (by index)
-        containing domain indices which are nested around this
-        domain.
-
-        Each domains nest list walks from the leaves of the nesting
-        tree to the root.
-        """
-
-        # The stack of iname sets records which inames are active
-        # as we step through the linear list of domains. It also
-        # determines the granularity of inames to be popped/decactivated
-        # if we ascend a level.
-
-        iname_set_stack = []
-        result = []
-
-        writer_map = self.writer_map()
-
-        for dom in self.domains:
-            parameters = set(dom.get_var_names(dim_type.param))
-            inames = set(dom.get_var_names(dim_type.set))
-
-            # This next domain may be nested inside the previous domain.
-            # Or it may not, in which case we need to figure out how many
-            # levels of parents we need to discard in order to find the
-            # true parent.
-
-            discard_level_count = 0
-            while discard_level_count < len(iname_set_stack):
-                # {{{ check for parenthood by loop bound iname
-
-                last_inames = iname_set_stack[-1-discard_level_count]
-                if last_inames & parameters:
-                    break
-
-                # }}}
-
-                # {{{ check for parenthood by written variable
-
-                is_parent_by_variable = False
-                for par in parameters:
-                    if par in self.temporary_variables:
-                        writer_insns = writer_map[par]
-
-                        if len(writer_insns) > 1:
-                            raise RuntimeError("loop bound '%s' "
-                                    "may only be written to once" % par)
-
-                        writer_insn, = writer_insns
-                        writer_inames = self.insn_inames(writer_insn)
-
-                        if writer_inames & last_inames:
-                            is_parent_by_variable = True
-                            break
-
-                if is_parent_by_variable:
-                    break
-
-                # }}}
-
-                discard_level_count += 1
-
-            if discard_level_count:
-                iname_set_stack = iname_set_stack[:-discard_level_count]
-
-            if result:
-                parent = len(result)-1
-            else:
-                parent = None
-
-            for i in range(discard_level_count):
-                assert parent is not None
-                parent = result[parent]
-
-            # found this domain's parent
-            result.append(parent)
-
-            if iname_set_stack:
-                parent_inames = iname_set_stack[-1]
-            else:
-                parent_inames = set()
-            iname_set_stack.append(parent_inames | inames)
-
-        return result
-
-    @memoize_method
-    def all_parents_per_domain(self):
-        """Return a list corresponding to self.domains (by index)
-        containing domain indices which are nested around this
-        domain.
-
-        Each domains nest list walks from the leaves of the nesting
-        tree to the root.
-        """
-        result = []
-
-        ppd = self.parents_per_domain()
-        for dom, parent in zip(self.domains, ppd):
-            # keep walking up tree to find *all* parents
-            dom_result = []
-            while parent is not None:
-                dom_result.insert(0, parent)
-                parent = ppd[parent]
-
-            result.append(dom_result)
-
-        return result
-
-    @memoize_method
-    def _get_home_domain_map(self):
-        return dict(
-                (iname, i_domain)
-                for i_domain, dom in enumerate(self.domains)
-                for iname in dom.get_var_names(dim_type.set))
-
-    def get_home_domain_index(self, iname):
-        return self._get_home_domain_map()[iname]
-
-    @memoize_method
-    def combine_domains(self, domains):
-        """
-        :arg domains: domain indices of domains to be combined. More 'dominant'
-            domains (those which get most say on the actual dim_type of an iname)
-            must be later in the order.
-        """
-        assert isinstance(domains, tuple) # for caching
-
-        if not domains:
-            return isl.BasicSet.universe(isl.Space.set_alloc(
-                self.isl_context, 0, 0))
-
-        result = None
-        for dom_index in domains:
-            dom = self.domains[dom_index]
-            if result is None:
-                result = dom
-            else:
-                aligned_dom, aligned_result = isl.align_two(
-                        dom, result, across_dim_types=True)
-                result = aligned_result & aligned_dom
-
-        return result
-
-    def get_inames_domain(self, inames):
-        if not inames:
-            return self.combine_domains(())
-
-        if isinstance(inames, str):
-            inames = frozenset([inames])
-        if not isinstance(inames, frozenset):
-            inames = frozenset(inames)
-
-            from warnings import warn
-            warn("get_inames_domain did not get a frozenset", stacklevel=2)
-
-        return self._get_inames_domain_backend(inames)
-
-    @memoize_method
-    def get_leaf_domain_indices(self, inames):
-        """Find the leaves of the domain tree needed to cover all inames."""
-
-        hdm = self._get_home_domain_map()
-        ppd = self.all_parents_per_domain()
-
-        domain_indices = set()
-
-        # map root -> leaf
-        root_to_leaf = {}
-
-        for iname in inames:
-            home_domain_index = hdm[iname]
-            if home_domain_index in domain_indices:
-                # nothin' new
-                continue
-
-            domain_parents = [home_domain_index] + ppd[home_domain_index]
-            current_root = domain_parents[-1]
-            previous_leaf = root_to_leaf.get(current_root)
-
-            if previous_leaf is not None:
-                # Check that we don't branch the domain tree.
-                #
-                # Branching the domain tree is dangerous/ill-formed because
-                # it can introduce artificial restrictions on variables
-                # further up the tree.
-
-                prev_parents = set(ppd[previous_leaf])
-                if not prev_parents <= set(domain_parents):
-                    raise CannotBranchDomainTree("iname set '%s' requires "
-                            "branch in domain tree (when adding '%s')"
-                            % (", ".join(inames), iname))
-            else:
-                # We're adding a new root. That's fine.
-                pass
-
-            root_to_leaf[current_root] = home_domain_index
-            domain_indices.update(domain_parents)
-
-        return root_to_leaf.values()
-
-    @memoize_method
-    def _get_inames_domain_backend(self, inames):
-        domain_indices = set()
-        for leaf_dom_idx in self.get_leaf_domain_indices(inames):
-            domain_indices.add(leaf_dom_idx)
-            domain_indices.update(self.all_parents_per_domain()[leaf_dom_idx])
-
-        return self.combine_domains(tuple(sorted(domain_indices)))
-
-    # }}}
-
-    # {{{ iname wrangling
-
-    @memoize_method
-    def all_inames(self):
-        result = set()
-        for dom in self.domains:
-            result.update(dom.get_var_names(dim_type.set))
-        return frozenset(result)
-
-    @memoize_method
-    def all_params(self):
-        all_inames = self.all_inames()
-
-        result = set()
-        for dom in self.domains:
-            result.update(set(dom.get_var_names(dim_type.param)) - all_inames)
-
-        return frozenset(result)
-
-    @memoize_method
-    def all_insn_inames(self):
-        """Return a mapping from instruction ids to inames inside which
-        they should be run.
-        """
-
-        return find_all_insn_inames(self)
-
-    @memoize_method
-    def all_referenced_inames(self):
-        result = set()
-        for inames in self.all_insn_inames().itervalues():
-            result.update(inames)
-        return result
-
-    def insn_inames(self, insn):
-        if isinstance(insn, Instruction):
-            return self.all_insn_inames()[insn.id]
-        else:
-            return self.all_insn_inames()[insn]
-
-    @memoize_method
-    def iname_to_insns(self):
-        result = dict(
-                (iname, set()) for iname in self.all_inames())
-        for insn in self.instructions:
-            for iname in self.insn_inames(insn):
-                result[iname].add(insn.id)
-
-        return result
-
-    # }}}
-
-    # {{{ read and written variables
-
-    @memoize_method
-    def reader_map(self):
-        """
-        :return: a dict that maps variable names to ids of insns that read that variable.
-        """
-        result = {}
-
-        admissible_vars = (
-                set(arg.name for arg in self.args)
-                | set(self.temporary_variables.iterkeys()))
-
-        for insn in self.instructions:
-            for var_name in insn.get_read_var_names() & admissible_vars:
-                result.setdefault(var_name, set()).add(insn.id)
-
-    @memoize_method
-    def writer_map(self):
-        """
-        :return: a dict that maps variable names to ids of insns that write to that variable.
-        """
-        result = {}
-
-        for insn in self.instructions:
-            var_name = insn.get_assignee_var_name()
-            var_names = [var_name]
-
-            for var_name in var_names:
-                result.setdefault(var_name, set()).add(insn.id)
-
-        return result
-
-    @memoize_method
-    def get_read_variables(self):
-        result = set()
-        for insn in self.instructions:
-            result.update(insn.get_read_var_names())
-        return result
-
-    @memoize_method
-    def get_written_variables(self):
-        return frozenset(
-            insn.get_assignee_var_name()
-            for insn in self.instructions)
-
-    # }}}
-
-    # {{{ argument wrangling
-
-    @property
-    @memoize_method
-    def arg_dict(self):
-        return dict((arg.name, arg) for arg in self.args)
-
-    @property
-    @memoize_method
-    def scalar_loop_args(self):
-        if self.args is None:
-            return []
-        else:
-            from pytools import flatten
-            loop_arg_names = list(flatten(dom.get_var_names(dim_type.param)
-                    for dom in self.domains))
-            return [arg.name for arg in self.args if isinstance(arg, ValueArg)
-                    if arg.name in loop_arg_names]
-    # }}}
-
-    # {{{ bounds finding
-
-    @memoize_method
-    def get_iname_bounds(self, iname):
-        domain = self.get_inames_domain(frozenset([iname]))
-        d_var_dict = domain.get_var_dict()
-
-        assumptions, domain = isl.align_two(self.assumptions, domain)
-
-        dom_intersect_assumptions = assumptions & domain
-
-        lower_bound_pw_aff = (
-                self.cache_manager.dim_min(
-                    dom_intersect_assumptions,
-                    d_var_dict[iname][1])
-                .coalesce())
-        upper_bound_pw_aff = (
-                self.cache_manager.dim_max(
-                    dom_intersect_assumptions,
-                    d_var_dict[iname][1])
-                .coalesce())
-
-        class BoundsRecord(Record):
-            pass
-
-        size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
-        size = size.gist(self.assumptions)
-
-        return BoundsRecord(
-                lower_bound_pw_aff=lower_bound_pw_aff,
-                upper_bound_pw_aff=upper_bound_pw_aff,
-                size=size)
-
-    def find_var_base_indices_and_shape_from_inames(
-            self, inames, cache_manager, context=None):
-        if not inames:
-            return [], []
-
-        base_indices_and_sizes = [
-                cache_manager.base_index_and_length(
-                    self.get_inames_domain(iname), iname, context)
-                for iname in inames]
-        return zip(*base_indices_and_sizes)
-
-    @memoize_method
-    def get_constant_iname_length(self, iname):
-        from loopy.isl_helpers import static_max_of_pw_aff
-        from loopy.symbolic import aff_to_expr
-        return int(aff_to_expr(static_max_of_pw_aff(
-                self.get_iname_bounds(iname).size,
-                constants_only=True)))
-
-    @memoize_method
-    def get_grid_sizes(self, ignore_auto=False):
-        all_inames_by_insns = set()
-        for insn in self.instructions:
-            all_inames_by_insns |= self.insn_inames(insn)
-
-        if not all_inames_by_insns <= self.all_inames():
-            raise RuntimeError("some inames collected from instructions (%s) "
-                    "are not present in domain (%s)"
-                    % (", ".join(sorted(all_inames_by_insns)),
-                        ", ".join(sorted(self.all_inames()))))
-
-        global_sizes = {}
-        local_sizes = {}
-
-        from loopy.kernel import (
-                GroupIndexTag, LocalIndexTag,
-                AutoLocalIndexTagBase)
-
-        for iname in self.all_inames():
-            tag = self.iname_to_tag.get(iname)
-
-            if isinstance(tag, GroupIndexTag):
-                tgt_dict = global_sizes
-            elif isinstance(tag, LocalIndexTag):
-                tgt_dict = local_sizes
-            elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
-                raise RuntimeError("cannot find grid sizes if automatic local index tags are "
-                        "present")
-            else:
-                tgt_dict = None
-
-            if tgt_dict is None:
-                continue
-
-            size = self.get_iname_bounds(iname).size
-
-            if tag.axis in tgt_dict:
-                size = tgt_dict[tag.axis].max(size)
-
-            from loopy.isl_helpers import static_max_of_pw_aff
-            try:
-                # insist block size is constant
-                size = static_max_of_pw_aff(size,
-                        constants_only=isinstance(tag, LocalIndexTag))
-            except ValueError:
-                pass
-
-            tgt_dict[tag.axis] = size
-
-        max_dims = self.device.max_work_item_dimensions
-
-        def to_dim_tuple(size_dict, which, forced_sizes={}):
-            forced_sizes = forced_sizes.copy()
-
-            size_list = []
-            sorted_axes = sorted(size_dict.iterkeys())
-
-            while sorted_axes or forced_sizes:
-                if sorted_axes:
-                    cur_axis = sorted_axes.pop(0)
-                else:
-                    cur_axis = None
-
-                if len(size_list) in forced_sizes:
-                    size_list.append(
-                           forced_sizes.pop(len(size_list)))
-                    continue
-
-                assert cur_axis is not None
-
-                if cur_axis > len(size_list):
-                    raise RuntimeError("%s axis %d unused" % (
-                        which, len(size_list)))
-
-                size_list.append(size_dict[cur_axis])
-
-            if len(size_list) > max_dims:
-                raise ValueError("more %s dimensions assigned than supported "
-                        "by hardware (%d > %d)" % (which, len(size_list), max_dims))
-
-            return tuple(size_list)
-
-        return (to_dim_tuple(global_sizes, "global"),
-                to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
-
-    def get_grid_sizes_as_exprs(self, ignore_auto=False):
-        grid_size, group_size = self.get_grid_sizes(ignore_auto=ignore_auto)
-
-        def tup_to_exprs(tup):
-            from loopy.symbolic import pw_aff_to_expr
-            return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup)
-
-        return tup_to_exprs(grid_size), tup_to_exprs(group_size)
-
-    # }}}
-
-    # {{{ local memory
-
-    @memoize_method
-    def local_var_names(self):
-        return set(
-                tv.name
-            for tv in self.temporary_variables.itervalues()
-            if tv.is_local)
-
-    def local_mem_use(self):
-        return sum(lv.nbytes for lv in self.temporary_variables.itervalues()
-                if lv.is_local)
-
-    # }}}
-
-    # {{{ pretty-printing
-
-    def __str__(self):
-        lines = []
-
-        sep = 75*"-"
-        lines.append(sep)
-        lines.append("INAME-TO-TAG MAP:")
-        for iname in sorted(self.all_inames()):
-            line = "%s: %s" % (iname, self.iname_to_tag.get(iname))
-            lines.append(line)
-
-        lines.append(sep)
-        lines.append("DOMAINS:")
-        for dom, parents in zip(self.domains, self.all_parents_per_domain()):
-            lines.append(len(parents)*"  " + str(dom))
-
-        if self.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
-            for rule_name in sorted(self.substitutions.iterkeys()):
-                lines.append(str(self.substitutions[rule_name]))
-
-        lines.append(sep)
-        lines.append("INSTRUCTIONS:")
-        loop_list_width = 35
-        for insn in self.instructions:
-            loop_list = ",".join(sorted(self.insn_inames(insn)))
-
-            options = [insn.id]
-            if insn.priority:
-                options.append("priority=%d" % insn.priority)
-
-            if len(loop_list) > loop_list_width:
-                lines.append("[%s]" % loop_list)
-                lines.append("%s%s <- %s   # %s" % (
-                    (loop_list_width+2)*" ", insn.assignee,
-                    insn.expression, ", ".join(options)))
-            else:
-                lines.append("[%s]%s%s <- %s   # %s" % (
-                    loop_list, " "*(loop_list_width-len(loop_list)),
-                    insn.assignee, insn.expression, ", ".join(options)))
-
-        lines.append(sep)
-        lines.append("DEPENDENCIES:")
-        for insn in self.instructions:
-            if insn.insn_deps:
-                lines.append("%s : %s" % (insn.id, ",".join(insn.insn_deps)))
-        lines.append(sep)
-
-        if self.schedule is not None:
-            lines.append("SCHEDULE:")
-            from loopy.schedule import dump_schedule
-            lines.append(dump_schedule(self.schedule))
-            lines.append(sep)
-
-        return "\n".join(lines)
-
-    # }}}
-
-# }}}
-
-# {{{ add and infer argument dtypes
-
-def add_argument_dtypes(knl, dtype_dict):
-    dtype_dict = dtype_dict.copy()
-    new_args = []
-
-    for arg in knl.args:
-        new_dtype = dtype_dict.pop(arg.name, None)
-        if new_dtype is not None:
-            new_dtype = np.dtype(new_dtype)
-            if arg.dtype is not None and arg.dtype != new_dtype:
-                raise RuntimeError(
-                        "argument '%s' already has a different dtype "
-                        "(existing: %s, new: %s)"
-                        % (arg.name, arg.dtype, new_dtype))
-            arg = arg.copy(dtype=new_dtype)
-
-        new_args.append(arg)
-
-    knl = knl.copy(args=new_args)
-
-    if dtype_dict:
-        raise RuntimeError("unused argument dtypes: %s"
-                % ", ".join(dtype_dict))
-
-    return knl.copy(args=new_args)
-
-def infer_argument_dtypes(knl):
-    new_args = []
-
-    writer_map = knl.writer_map()
-
-    from loopy.codegen.expression import (
-            TypeInferenceMapper, TypeInferenceFailure)
-    tim = TypeInferenceMapper(knl)
-
-    for arg in knl.args:
-        if arg.dtype is None:
-            new_dtype = None
-
-            if arg.name in knl.all_params():
-                new_dtype = knl.index_dtype
-            else:
-                try:
-                    for write_insn_id in writer_map.get(arg.name, ()):
-                        write_insn = knl.id_to_insn[write_insn_id]
-                        new_tim_dtype = tim(write_insn.expression)
-                        if new_dtype is None:
-                            new_dtype = new_tim_dtype
-                        elif new_dtype != new_tim_dtype:
-                            # Now we know *nothing*.
-                            new_dtype = None
-                            break
-
-                except TypeInferenceFailure:
-                    # Even one type inference failure is enough to
-                    # make this dtype not safe to guess. Don't.
-                    pass
-
-            if new_dtype is not None:
-                arg = arg.copy(dtype=new_dtype)
-
-        new_args.append(arg)
-
-    return knl.copy(args=new_args)
-
-def get_arguments_with_incomplete_dtype(knl):
-    return [arg.name for arg in knl.args
-            if arg.dtype is None]
-
-# }}}
-
-# {{{ find_all_insn_inames fixed point iteration
-
-def find_all_insn_inames(kernel):
-    from loopy.symbolic import get_dependencies
-
-    writer_map = kernel.writer_map()
-
-    insn_id_to_inames = {}
-    insn_assignee_inames = {}
-
-    all_read_deps = {}
-    all_write_deps = {}
-
-    from loopy.subst import expand_subst
-    kernel = expand_subst(kernel)
-
-    for insn in kernel.instructions:
-        all_read_deps[insn.id] = read_deps = get_dependencies(insn.expression)
-        all_write_deps[insn.id] = write_deps = get_dependencies(insn.assignee)
-        deps = read_deps | write_deps
-
-        iname_deps = (
-                deps & kernel.all_inames()
-                | insn.forced_iname_deps)
-
-        insn_id_to_inames[insn.id] = iname_deps
-        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
-
-    temp_var_names = set(kernel.temporary_variables.iterkeys())
-
-    # fixed point iteration until all iname dep sets have converged
-
-    # Why is fixed point iteration necessary here? Consider the following
-    # scenario:
-    #
-    # z = expr(iname)
-    # y = expr(z)
-    # x = expr(y)
-    #
-    # x clearly has a dependency on iname, but this is not found until that
-    # dependency has propagated all the way up. Doing this recursively is
-    # not guaranteed to terminate because of circular dependencies.
-
-    while True:
-        did_something = False
-        for insn in kernel.instructions:
-
-            # {{{ depdency-based propagation
-
-            # For all variables that insn depends on, find the intersection
-            # of iname deps of all writers, and add those to insn's
-            # dependencies.
-
-            for tv_name in (all_read_deps[insn.id] & temp_var_names):
-                implicit_inames = None
-
-                for writer_id in writer_map[tv_name]:
-                    writer_implicit_inames = (
-                            insn_id_to_inames[writer_id]
-                            - insn_assignee_inames[writer_id])
-                    if implicit_inames is None:
-                        implicit_inames = writer_implicit_inames
-                    else:
-                        implicit_inames = (implicit_inames
-                                & writer_implicit_inames)
-
-                inames_old = insn_id_to_inames[insn.id]
-                inames_new = (inames_old | implicit_inames) \
-                            - insn.reduction_inames()
-                insn_id_to_inames[insn.id] = inames_new
-
-                if inames_new != inames_old:
-                    did_something = True
-
-            # }}}
-
-            # {{{ domain-based propagation
-
-            # Add all inames occurring in parameters of domains that my current
-            # inames refer to.
-
-            inames_old = insn_id_to_inames[insn.id]
-            inames_new = set(insn_id_to_inames[insn.id])
-
-            for iname in inames_old:
-                home_domain = kernel.domains[kernel.get_home_domain_index(iname)]
-
-                for par in home_domain.get_var_names(dim_type.param):
-                    if par in kernel.all_inames():
-                        inames_new.add(par)
-
-            if inames_new != inames_old:
-                did_something = True
-                insn_id_to_inames[insn.id] = frozenset(inames_new)
-
-            # }}}
-
-        if not did_something:
-            break
-
-    return insn_id_to_inames
-
-# }}}
-
-# {{{ set operation cache
-
-class SetOperationCacheManager:
-    def __init__(self):
-        # mapping: set hash -> [(set, op, args, result)]
-        self.cache = {}
-
-    def op(self, set, op_name, op, args):
-        hashval = hash(set)
-        bucket = self.cache.setdefault(hashval, [])
-
-        for bkt_set, bkt_op, bkt_args, result  in bucket:
-            if set.plain_is_equal(bkt_set) and op == bkt_op and args == bkt_args:
-                return result
-
-        #print op, set.get_dim_name(dim_type.set, args[0])
-        result = op(*args)
-        bucket.append((set, op_name, args, result))
-        return result
-
-    def dim_min(self, set, *args):
-        return self.op(set, "dim_min", set.dim_min, args)
-
-    def dim_max(self, set, *args):
-        return self.op(set, "dim_max", set.dim_max, args)
-
-    def base_index_and_length(self, set, iname, context=None):
-        iname_to_dim = set.space.get_var_dict()
-        lower_bound_pw_aff = self.dim_min(set, iname_to_dim[iname][1])
-        upper_bound_pw_aff = self.dim_max(set, iname_to_dim[iname][1])
-
-        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
-        from loopy.symbolic import pw_aff_to_expr
-
-        size = pw_aff_to_expr(static_max_of_pw_aff(
-                upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True,
-                context=context))
-        base_index = pw_aff_to_expr(
-            static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False,
-                context=context))
-
-        return base_index, size
-
-# }}}
-
-# {{{ domain change helper
-
-class DomainChanger:
-    """Helps change the domain responsible for *inames* within a kernel.
-
-    .. note: Does not perform an in-place change!
-    """
-
-    def __init__(self, kernel, inames):
-        self.kernel = kernel
-        if inames:
-            ldi = kernel.get_leaf_domain_indices(inames)
-            if len(ldi) > 1:
-                raise RuntimeError("Inames '%s' require more than one leaf "
-                        "domain, which makes the domain change that is part "
-                        "of your current operation ambiguous." % ", ".join(inames))
-
-            self.leaf_domain_index, = ldi
-            self.domain = kernel.domains[self.leaf_domain_index]
-
-        else:
-            self.domain = kernel.combine_domains(())
-            self.leaf_domain_index = None
-
-    def get_domains_with(self, replacement):
-        result = self.kernel.domains[:]
-        if self.leaf_domain_index is not None:
-            result[self.leaf_domain_index] = replacement
-        else:
-            result.append(replacement)
-
-        return result
-
-# }}}
-
-# {{{ graphviz / dot export
-
-def get_dot_dependency_graph(kernel, iname_cluster=False, iname_edge=True):
-    lines = []
-    for insn in kernel.instructions:
-        lines.append("%s [shape=\"box\"];" % insn.id)
-        for dep in insn.insn_deps:
-            lines.append("%s -> %s;" % (dep, insn.id))
-
-        if iname_edge:
-            for iname in kernel.insn_inames(insn):
-                lines.append("%s -> %s [style=\"dotted\"];" % (iname, insn.id))
-
-    if iname_cluster:
-        for iname in kernel.all_inames():
-            lines.append("subgraph cluster_%s { label=\"%s\" %s }" % (iname, iname,
-                " ".join(insn.id for insn in kernel.instructions
-                    if iname in kernel.insn_inames(insn))))
-
-    return "digraph loopy_deps {\n%s\n}" % "\n".join(lines)
-
-# }}}
-
-# vim: foldmethod=marker
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b07a487fc192b6fc6ab9ae1063e730872e6e591f
--- /dev/null
+++ b/loopy/kernel/__init__.py
@@ -0,0 +1,887 @@
+"""Kernel object."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+
+
+
+import numpy as np
+from pytools import Record, memoize_method
+import islpy as isl
+from islpy import dim_type
+
+from loopy.kernel.creation import UniqueNameGenerator, generate_unique_possibilities
+
+from loopy.kernel.data import (
+        default_function_mangler,
+        opencl_function_mangler,
+        single_arg_function_mangler,
+
+        opencl_symbol_mangler,
+
+        default_preamble_generator,
+        )
+
+
+
+
+class CannotBranchDomainTree(RuntimeError):
+    pass
+
+# {{{ loop kernel object
+
+class LoopKernel(Record):
+    """
+    :ivar device: :class:`pyopencl.Device`
+    :ivar domains: :class:`islpy.BasicSet`
+    :ivar instructions:
+    :ivar args:
+    :ivar schedule:
+    :ivar name:
+    :ivar preambles: a list of (tag, code) tuples that identify preamble snippets.
+        Each tag's snippet is only included once, at its first occurrence.
+        The preambles will be inserted in order of their tags.
+    :ivar preamble_generators: a list of functions of signature
+        (seen_dtypes, seen_functions) where seen_functions is a set of
+        (name, c_name, arg_dtypes), generating extra entries for `preambles`.
+    :ivar assumptions: the initial implemented_domain, captures assumptions
+        on the parameters. (an isl.Set)
+    :ivar local_sizes: A dictionary from integers to integers, mapping
+        workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
+        length 16.
+    :ivar temporary_variables:
+    :ivar iname_to_tag:
+    :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
+        objects
+    :ivar function_manglers: list of functions of signature (name, arg_dtypes)
+        returning a tuple (result_dtype, c_name)
+        or a tuple (result_dtype, c_name, arg_dtypes),
+        where c_name is the C-level function to be called.
+    :ivar symbol_manglers: list of functions of signature (name) returning
+        a tuple (result_dtype, c_name), where c_name is the C-level symbol to be
+        evaluated.
+    :ivar defines: a dictionary of replacements to be made in instructions given
+        as strings before parsing. A macro instance intended to be replaced should
+        look like "MACRO" in the instruction code. The expansion given in this
+        parameter is allowed to be a list. In this case, instructions are generated
+        for *each* combination of macro values.
+
+        These defines may also be used in the domain and in argument shapes and
+        strides. They are expanded only upon kernel creation.
+
+    The following arguments are not user-facing:
+
+    :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr,
+        upper_incr) tuples that will be separated out in the execution to generate
+        'bulk' slabs with fewer conditionals.
+    :ivar applied_iname_rewrites: A list of past substitution dictionaries that
+        were applied to the kernel. These are stored so that they may be repeated
+        on expressions the user specifies later.
+    :ivar cache_manager:
+    :ivar isl_context:
+    """
+
+    # {{{ constructor
+
+    def __init__(self, device, domains, instructions, args=[], schedule=None,
+            name="loopy_kernel",
+            preambles=[],
+            preamble_generators=[default_preamble_generator],
+            assumptions=None,
+            local_sizes={},
+            temporary_variables={},
+            iname_to_tag={},
+            substitutions={},
+            function_manglers=[
+                default_function_mangler,
+                opencl_function_mangler,
+                single_arg_function_mangler,
+                ],
+            symbol_manglers=[opencl_symbol_mangler],
+            defines={},
+
+            # non-user-facing
+            iname_slab_increments={},
+            applied_iname_rewrites=[],
+            cache_manager=None,
+            index_dtype=np.int32,
+            isl_context=None,
+
+            # When kernels get intersected in slab decomposition,
+            # their grid sizes shouldn't change. This provides
+            # a way to forward sub-kernel grid size requests.
+            get_grid_sizes=None):
+        """
+        :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
+            Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
+        """
+
+        if cache_manager is None:
+            from loopy.kernel.tools import SetOperationCacheManager
+            cache_manager = SetOperationCacheManager()
+
+        # {{{ make instruction ids unique
+
+        from loopy.kernel.creation import MakeUnique
+
+        insn_ids = set()
+        for insn in instructions:
+            if insn.id is not None and not isinstance(insn.id, MakeUnique):
+                if insn.id in insn_ids:
+                    raise RuntimeError("duplicate instruction id: %s" % insn.id)
+                insn_ids.add(insn.id)
+
+        insn_id_gen = UniqueNameGenerator(insn_ids)
+
+        new_instructions = []
+
+        for insn in instructions:
+            if insn.id is None:
+                new_instructions.append(
+                        insn.copy(id=insn_id_gen("insn")))
+            elif isinstance(insn.id, MakeUnique):
+                new_instructions.append(
+                        insn.copy(id=insn_id_gen(insn.id.name)))
+            else:
+                new_instructions.append(insn)
+
+        instructions = new_instructions
+        del new_instructions
+
+        # }}}
+
+        # {{{ process assumptions
+
+        if assumptions is None:
+            dom0_space = domains[0].get_space()
+            assumptions_space = isl.Space.params_alloc(
+                    dom0_space.get_ctx(), dom0_space.dim(dim_type.param))
+            for i in xrange(dom0_space.dim(dim_type.param)):
+                assumptions_space = assumptions_space.set_dim_name(
+                        dim_type.param, i, dom0_space.get_dim_name(dim_type.param, i))
+            assumptions = isl.BasicSet.universe(assumptions_space)
+
+        elif isinstance(assumptions, str):
+            all_inames = set()
+            all_params = set()
+            for dom in domains:
+                all_inames.update(dom.get_var_names(dim_type.set))
+                all_params.update(dom.get_var_names(dim_type.param))
+
+            domain_parameters = all_params-all_inames
+
+            assumptions_set_str = "[%s] -> { : %s}" \
+                    % (",".join(s for s in domain_parameters),
+                        assumptions)
+            assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(),
+                    assumptions_set_str)
+
+        assert assumptions.is_params()
+
+        # }}}
+
+        # {{{ expand macros in arg shapes
+
+        from loopy.kernel.data import ShapedArg
+        from loopy.kernel.creation import expand_defines_in_expr
+
+        processed_args = []
+        for arg in args:
+            for arg_name in arg.name.split(","):
+                new_arg = arg.copy(name=arg_name)
+                if isinstance(arg, ShapedArg):
+                    if arg.shape is not None:
+                        new_arg = new_arg.copy(shape=expand_defines_in_expr(arg.shape, defines))
+                    if arg.strides is not None:
+                        new_arg = new_arg.copy(strides=expand_defines_in_expr(arg.strides, defines))
+
+                processed_args.append(new_arg)
+
+        # }}}
+
+        index_dtype = np.dtype(index_dtype)
+        if index_dtype.kind != 'i':
+            raise TypeError("index_dtype must be an integer")
+        if np.iinfo(index_dtype).min >= 0:
+            raise TypeError("index_dtype must be signed")
+
+        if get_grid_sizes is not None:
+            # overwrites method down below
+            self.get_grid_sizes = get_grid_sizes
+
+        Record.__init__(self,
+                device=device, domains=domains,
+                instructions=instructions,
+                args=processed_args,
+                schedule=schedule,
+                name=name,
+                preambles=preambles,
+                preamble_generators=preamble_generators,
+                assumptions=assumptions,
+                iname_slab_increments=iname_slab_increments,
+                temporary_variables=temporary_variables,
+                local_sizes=local_sizes,
+                iname_to_tag=iname_to_tag,
+                substitutions=substitutions,
+                cache_manager=cache_manager,
+                applied_iname_rewrites=applied_iname_rewrites,
+                function_manglers=function_manglers,
+                symbol_manglers=symbol_manglers,
+                index_dtype=index_dtype,
+                isl_context=isl_context)
+
+    # }}}
+
+    # {{{ function mangling
+
+    def register_function_mangler(self, mangler):
+        return self.copy(
+                function_manglers=[mangler]+self.function_manglers)
+
+    def mangle_function(self, identifier, arg_dtypes):
+        for mangler in self.function_manglers:
+            mangle_result = mangler(identifier, arg_dtypes)
+            if mangle_result is not None:
+                return mangle_result
+
+        return None
+
+    # }}}
+
+    # {{{ name wrangling
+
+    @memoize_method
+    def non_iname_variable_names(self):
+        return (set(self.arg_dict.iterkeys())
+                | set(self.temporary_variables.iterkeys()))
+
+    @memoize_method
+    def all_variable_names(self):
+        return (
+                set(self.temporary_variables.iterkeys())
+                | set(self.substitutions.iterkeys())
+                | set(arg.name for arg in self.args)
+                | set(self.all_inames()))
+
+    def get_var_name_generator(self):
+        return UniqueNameGenerator(self.all_variable_names())
+
+    def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=frozenset()):
+        """Return an id built from *based_on* that no insn in *insns* (default: own) or *extra_used_ids* uses."""
+        if insns is None:
+            insns = self.instructions
+        used_ids = set(insn.id for insn in insns) | extra_used_ids
+        # generate_unique_possibilities yields candidates without end, so this loop always returns.
+        for id_str in generate_unique_possibilities(based_on):
+            if id_str not in used_ids:
+                return id_str
+
+    def get_var_descriptor(self, name):
+        try:
+            return self.arg_dict[name]
+        except KeyError:
+            pass
+
+        try:
+            return self.temporary_variables[name]
+        except KeyError:
+            pass
+
+        raise ValueError("nothing known about variable '%s'" % name)
+
+    @property
+    @memoize_method
+    def id_to_insn(self):
+        return dict((insn.id, insn) for insn in self.instructions)
+
+    # }}}
+
+    # {{{ domain wrangling
+
+    @memoize_method
+    def parents_per_domain(self):
+        """Return a list corresponding to self.domains (by index)
+        containing, for each domain, the index of the domain nested
+        immediately around it, or *None* if there is no such domain.
+
+        :raises RuntimeError: if a temporary variable used as a loop
+            bound is written by more than one instruction.
+        """
+
+        # The stack of iname sets records which inames are active
+        # as we step through the linear list of domains. It also
+        # determines the granularity of inames to be popped/deactivated
+        # if we ascend a level.
+
+        iname_set_stack = []
+        result = []
+
+        writer_map = self.writer_map()
+
+        for dom in self.domains:
+            parameters = set(dom.get_var_names(dim_type.param))
+            inames = set(dom.get_var_names(dim_type.set))
+
+            # This next domain may be nested inside the previous domain.
+            # Or it may not, in which case we need to figure out how many
+            # levels of parents we need to discard in order to find the
+            # true parent.
+
+            discard_level_count = 0
+            while discard_level_count < len(iname_set_stack):
+                # {{{ check for parenthood by loop bound iname
+
+                last_inames = iname_set_stack[-1-discard_level_count]
+                if last_inames & parameters:
+                    break
+
+                # }}}
+
+                # {{{ check for parenthood by written variable
+
+                is_parent_by_variable = False
+                for par in parameters:
+                    if par in self.temporary_variables:
+                        writer_insns = writer_map[par]
+
+                        if len(writer_insns) > 1:
+                            raise RuntimeError("loop bound '%s' "
+                                    "may only be written to once" % par)
+
+                        writer_insn, = writer_insns
+                        writer_inames = self.insn_inames(writer_insn)
+
+                        if writer_inames & last_inames:
+                            is_parent_by_variable = True
+                            break
+
+                if is_parent_by_variable:
+                    break
+
+                # }}}
+
+                discard_level_count += 1
+
+            if discard_level_count:
+                iname_set_stack = iname_set_stack[:-discard_level_count]
+            # the previous domain is the candidate parent; climb once per discarded level
+            if result:
+                parent = len(result)-1
+            else:
+                parent = None
+
+            for i in range(discard_level_count):
+                assert parent is not None
+                parent = result[parent]
+
+            # found this domain's parent
+            result.append(parent)
+
+            if iname_set_stack:
+                parent_inames = iname_set_stack[-1]
+            else:
+                parent_inames = set()
+            iname_set_stack.append(parent_inames | inames)
+
+        return result
+
+    @memoize_method
+    def all_parents_per_domain(self):
+        """Return a list corresponding to self.domains (by index)
+        containing, for each domain, the list of indices of *all*
+        domains nested around it.
+
+        Each domain's list is ordered from the root of the nesting
+        tree down to the domain's immediate parent.
+        """
+        result = []
+
+        ppd = self.parents_per_domain()
+        for dom, parent in zip(self.domains, ppd):
+            # keep walking up tree to find *all* parents
+            dom_result = []
+            while parent is not None:
+                dom_result.insert(0, parent)
+                parent = ppd[parent]
+
+            result.append(dom_result)
+
+        return result
+
+    @memoize_method
+    def _get_home_domain_map(self):
+        return dict(
+                (iname, i_domain)
+                for i_domain, dom in enumerate(self.domains)
+                for iname in dom.get_var_names(dim_type.set))
+
+    def get_home_domain_index(self, iname):
+        return self._get_home_domain_map()[iname]
+
+    @memoize_method
+    def combine_domains(self, domains):
+        """
+        :arg domains: domain indices of domains to be combined. More 'dominant'
+            domains (those which get most say on the actual dim_type of an iname)
+            must be later in the order.
+        """
+        assert isinstance(domains, tuple) # for caching
+
+        if not domains:
+            return isl.BasicSet.universe(isl.Space.set_alloc(
+                self.isl_context, 0, 0))
+
+        result = None
+        for dom_index in domains:
+            dom = self.domains[dom_index]
+            if result is None:
+                result = dom
+            else:
+                aligned_dom, aligned_result = isl.align_two(
+                        dom, result, across_dim_types=True)
+                result = aligned_result & aligned_dom
+
+        return result
+
+    def get_inames_domain(self, inames):
+        if not inames:
+            return self.combine_domains(())
+
+        if isinstance(inames, str):
+            inames = frozenset([inames])
+        if not isinstance(inames, frozenset):
+            inames = frozenset(inames)
+
+            from warnings import warn
+            warn("get_inames_domain did not get a frozenset", stacklevel=2)
+
+        return self._get_inames_domain_backend(inames)
+
+    @memoize_method
+    def get_leaf_domain_indices(self, inames):
+        """Find the leaves of the domain tree needed to cover all inames."""
+        # hdm: iname -> index of the domain declaring it; ppd: per-domain ancestor index lists
+        hdm = self._get_home_domain_map()
+        ppd = self.all_parents_per_domain()
+
+        domain_indices = set()
+
+        # map root -> leaf
+        root_to_leaf = {}
+
+        for iname in inames:
+            home_domain_index = hdm[iname]
+            if home_domain_index in domain_indices:
+                # nothin' new
+                continue
+            # NOTE(review): ppd lists are root-first; is [-1] really the root? confirm
+            domain_parents = [home_domain_index] + ppd[home_domain_index]
+            current_root = domain_parents[-1]
+            previous_leaf = root_to_leaf.get(current_root)
+
+            if previous_leaf is not None:
+                # Check that we don't branch the domain tree.
+                #
+                # Branching the domain tree is dangerous/ill-formed because
+                # it can introduce artificial restrictions on variables
+                # further up the tree.
+
+                prev_parents = set(ppd[previous_leaf])
+                if not prev_parents <= set(domain_parents):
+                    raise CannotBranchDomainTree("iname set '%s' requires "
+                            "branch in domain tree (when adding '%s')"
+                            % (", ".join(inames), iname))
+            else:
+                # We're adding a new root. That's fine.
+                pass
+
+            root_to_leaf[current_root] = home_domain_index
+            domain_indices.update(domain_parents)
+
+        return root_to_leaf.values()
+
+    @memoize_method
+    def _get_inames_domain_backend(self, inames):
+        domain_indices = set()
+        for leaf_dom_idx in self.get_leaf_domain_indices(inames):
+            domain_indices.add(leaf_dom_idx)
+            domain_indices.update(self.all_parents_per_domain()[leaf_dom_idx])
+
+        return self.combine_domains(tuple(sorted(domain_indices)))
+
+    # }}}
+
+    # {{{ iname wrangling
+
+    @memoize_method
+    def all_inames(self):
+        result = set()
+        for dom in self.domains:
+            result.update(dom.get_var_names(dim_type.set))
+        return frozenset(result)
+
+    @memoize_method
+    def all_params(self):
+        all_inames = self.all_inames()
+
+        result = set()
+        for dom in self.domains:
+            result.update(set(dom.get_var_names(dim_type.param)) - all_inames)
+
+        return frozenset(result)
+
+    @memoize_method
+    def all_insn_inames(self):
+        """Return a mapping from instruction ids to inames inside which
+        they should be run.
+        """
+
+        from loopy.kernel.tools import find_all_insn_inames
+        return find_all_insn_inames(self)
+
+    @memoize_method
+    def all_referenced_inames(self):
+        result = set()
+        for inames in self.all_insn_inames().itervalues():
+            result.update(inames)
+        return result
+
+    def insn_inames(self, insn):
+        from loopy.kernel.data import Instruction
+        if isinstance(insn, Instruction):
+            return self.all_insn_inames()[insn.id]
+        else:
+            return self.all_insn_inames()[insn]
+
+    @memoize_method
+    def iname_to_insns(self):
+        result = dict(
+                (iname, set()) for iname in self.all_inames())
+        for insn in self.instructions:
+            for iname in self.insn_inames(insn):
+                result[iname].add(insn.id)
+
+        return result
+
+    # }}}
+
+    # {{{ read and written variables
+
+    @memoize_method
+    def reader_map(self):
+        """
+        :return: a dict that maps variable names to ids of insns that read that variable.
+            Only kernel arguments and temporary variables appear as keys.
+        """
+        admissible_vars = (set(arg.name for arg in self.args)
+                | set(self.temporary_variables.iterkeys()))
+        result = {}
+        for insn in self.instructions:
+            for var_name in insn.get_read_var_names() & admissible_vars:
+                result.setdefault(var_name, set()).add(insn.id)
+
+        return result
+
+    @memoize_method
+    def writer_map(self):
+        """
+        :return: a dict that maps variable names to ids of insns that write to that variable.
+        """
+        result = {}
+
+        for insn in self.instructions:
+            var_name = insn.get_assignee_var_name()
+            var_names = [var_name]
+
+            for var_name in var_names:
+                result.setdefault(var_name, set()).add(insn.id)
+
+        return result
+
+    @memoize_method
+    def get_read_variables(self):
+        result = set()
+        for insn in self.instructions:
+            result.update(insn.get_read_var_names())
+        return result
+
+    @memoize_method
+    def get_written_variables(self):
+        return frozenset(
+            insn.get_assignee_var_name()
+            for insn in self.instructions)
+
+    # }}}
+
+    # {{{ argument wrangling
+
+    @property
+    @memoize_method
+    def arg_dict(self):
+        return dict((arg.name, arg) for arg in self.args)
+
+    @property
+    @memoize_method
+    def scalar_loop_args(self):
+        from loopy.kernel.data import ValueArg
+
+        if self.args is None:
+            return []
+        else:
+            from pytools import flatten
+            loop_arg_names = list(flatten(dom.get_var_names(dim_type.param)
+                    for dom in self.domains))
+            return [arg.name for arg in self.args if isinstance(arg, ValueArg)
+                    if arg.name in loop_arg_names]
+    # }}}
+
+    # {{{ bounds finding
+
+    @memoize_method
+    def get_iname_bounds(self, iname):
+        domain = self.get_inames_domain(frozenset([iname]))
+        d_var_dict = domain.get_var_dict()
+
+        assumptions, domain = isl.align_two(self.assumptions, domain)
+
+        dom_intersect_assumptions = assumptions & domain
+
+        lower_bound_pw_aff = (
+                self.cache_manager.dim_min(
+                    dom_intersect_assumptions,
+                    d_var_dict[iname][1])
+                .coalesce())
+        upper_bound_pw_aff = (
+                self.cache_manager.dim_max(
+                    dom_intersect_assumptions,
+                    d_var_dict[iname][1])
+                .coalesce())
+
+        class BoundsRecord(Record):
+            pass
+
+        size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
+        size = size.gist(self.assumptions)
+
+        return BoundsRecord(
+                lower_bound_pw_aff=lower_bound_pw_aff,
+                upper_bound_pw_aff=upper_bound_pw_aff,
+                size=size)
+
+    def find_var_base_indices_and_shape_from_inames(
+            self, inames, cache_manager, context=None):
+        if not inames:
+            return [], []
+
+        base_indices_and_sizes = [
+                cache_manager.base_index_and_length(
+                    self.get_inames_domain(iname), iname, context)
+                for iname in inames]
+        return zip(*base_indices_and_sizes)
+
+    @memoize_method
+    def get_constant_iname_length(self, iname):
+        from loopy.isl_helpers import static_max_of_pw_aff
+        from loopy.symbolic import aff_to_expr
+        return int(aff_to_expr(static_max_of_pw_aff(
+                self.get_iname_bounds(iname).size,
+                constants_only=True)))
+
+    @memoize_method
+    def get_grid_sizes(self, ignore_auto=False):
+        all_inames_by_insns = set()
+        for insn in self.instructions:
+            all_inames_by_insns |= self.insn_inames(insn)
+
+        if not all_inames_by_insns <= self.all_inames():
+            raise RuntimeError("some inames collected from instructions (%s) "
+                    "are not present in domain (%s)"
+                    % (", ".join(sorted(all_inames_by_insns)),
+                        ", ".join(sorted(self.all_inames()))))
+
+        global_sizes = {}
+        local_sizes = {}
+
+        from loopy.kernel.data import (
+                GroupIndexTag, LocalIndexTag,
+                AutoLocalIndexTagBase)
+
+        for iname in self.all_inames():
+            tag = self.iname_to_tag.get(iname)
+
+            if isinstance(tag, GroupIndexTag):
+                tgt_dict = global_sizes
+            elif isinstance(tag, LocalIndexTag):
+                tgt_dict = local_sizes
+            elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
+                raise RuntimeError("cannot find grid sizes if automatic local index tags are "
+                        "present")
+            else:
+                tgt_dict = None
+
+            if tgt_dict is None:
+                continue
+
+            size = self.get_iname_bounds(iname).size
+
+            if tag.axis in tgt_dict:
+                size = tgt_dict[tag.axis].max(size)
+
+            from loopy.isl_helpers import static_max_of_pw_aff
+            try:
+                # insist block size is constant
+                size = static_max_of_pw_aff(size,
+                        constants_only=isinstance(tag, LocalIndexTag))
+            except ValueError:
+                pass
+
+            tgt_dict[tag.axis] = size
+
+        max_dims = self.device.max_work_item_dimensions
+
+        def to_dim_tuple(size_dict, which, forced_sizes={}):
+            forced_sizes = forced_sizes.copy()
+
+            size_list = []
+            sorted_axes = sorted(size_dict.iterkeys())
+
+            while sorted_axes or forced_sizes:
+                if sorted_axes:
+                    cur_axis = sorted_axes.pop(0)
+                else:
+                    cur_axis = None
+
+                if len(size_list) in forced_sizes:
+                    size_list.append(
+                           forced_sizes.pop(len(size_list)))
+                    continue
+
+                assert cur_axis is not None
+
+                if cur_axis > len(size_list):
+                    raise RuntimeError("%s axis %d unused" % (
+                        which, len(size_list)))
+
+                size_list.append(size_dict[cur_axis])
+
+            if len(size_list) > max_dims:
+                raise ValueError("more %s dimensions assigned than supported "
+                        "by hardware (%d > %d)" % (which, len(size_list), max_dims))
+
+            return tuple(size_list)
+
+        return (to_dim_tuple(global_sizes, "global"),
+                to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
+
+    def get_grid_sizes_as_exprs(self, ignore_auto=False):
+        grid_size, group_size = self.get_grid_sizes(ignore_auto=ignore_auto)
+
+        def tup_to_exprs(tup):
+            from loopy.symbolic import pw_aff_to_expr
+            return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup)
+
+        return tup_to_exprs(grid_size), tup_to_exprs(group_size)
+
+    # }}}
+
+    # {{{ local memory
+
+    @memoize_method
+    def local_var_names(self):
+        return set(
+                tv.name
+            for tv in self.temporary_variables.itervalues()
+            if tv.is_local)
+
+    def local_mem_use(self):
+        return sum(lv.nbytes for lv in self.temporary_variables.itervalues()
+                if lv.is_local)
+
+    # }}}
+
+    # {{{ pretty-printing
+
+    def __str__(self):
+        lines = []
+
+        sep = 75*"-"
+        lines.append(sep)
+        lines.append("INAME-TO-TAG MAP:")
+        for iname in sorted(self.all_inames()):
+            line = "%s: %s" % (iname, self.iname_to_tag.get(iname))
+            lines.append(line)
+
+        lines.append(sep)
+        lines.append("DOMAINS:")
+        for dom, parents in zip(self.domains, self.all_parents_per_domain()):
+            lines.append(len(parents)*"  " + str(dom))
+
+        if self.substitutions:
+            lines.append(sep)
+            lines.append("SUBSTIUTION RULES:")
+            for rule_name in sorted(self.substitutions.iterkeys()):
+                lines.append(str(self.substitutions[rule_name]))
+
+        lines.append(sep)
+        lines.append("INSTRUCTIONS:")
+        loop_list_width = 35
+        for insn in self.instructions:
+            loop_list = ",".join(sorted(self.insn_inames(insn)))
+
+            options = [insn.id]
+            if insn.priority:
+                options.append("priority=%d" % insn.priority)
+
+            if len(loop_list) > loop_list_width:
+                lines.append("[%s]" % loop_list)
+                lines.append("%s%s <- %s   # %s" % (
+                    (loop_list_width+2)*" ", insn.assignee,
+                    insn.expression, ", ".join(options)))
+            else:
+                lines.append("[%s]%s%s <- %s   # %s" % (
+                    loop_list, " "*(loop_list_width-len(loop_list)),
+                    insn.assignee, insn.expression, ", ".join(options)))
+
+        lines.append(sep)
+        lines.append("DEPENDENCIES:")
+        for insn in self.instructions:
+            if insn.insn_deps:
+                lines.append("%s : %s" % (insn.id, ",".join(insn.insn_deps)))
+        lines.append(sep)
+
+        if self.schedule is not None:
+            lines.append("SCHEDULE:")
+            from loopy.schedule import dump_schedule
+            lines.append(dump_schedule(self.schedule))
+            lines.append(sep)
+
+        return "\n".join(lines)
+
+    # }}}
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f7bdb42af60434cb68287a642b0690a1111044
--- /dev/null
+++ b/loopy/kernel/creation.py
@@ -0,0 +1,667 @@
+"""UI for kernel creation."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+
+
+import numpy as np
+from loopy.symbolic import IdentityMapper
+from loopy.kernel.data import Instruction, SubstitutionRule
+import islpy as isl
+from islpy import dim_type
+
+import re
+
+
+# {{{ unique name generation
+
+def generate_unique_possibilities(prefix):
+    """Yield *prefix* itself, then "prefix_0", "prefix_1", ... forever."""
+    candidate, next_num = prefix, 0
+    while True:
+        yield candidate
+        candidate = "%s_%d" % (prefix, next_num)
+        next_num += 1
+
+class UniqueNameGenerator:
+    """Hands out names that do not collide with a tracked set of names."""
+    def __init__(self, existing_names):
+        self.existing_names = existing_names.copy()  # private copy of the set
+
+    def is_name_conflicting(self, name):
+        return name in self.existing_names
+
+    def add_name(self, name):  # raises ValueError if *name* is already taken
+        if self.is_name_conflicting(name):
+            raise ValueError("name '%s' conflicts with existing names" % name)
+        self.existing_names.add(name)
+
+    def add_names(self, names):
+        for name in names:
+            self.add_name(name)
+
+    def __call__(self, based_on="var"):  # returns (and reserves) a fresh name
+        for var_name in generate_unique_possibilities(based_on):
+            if not self.is_name_conflicting(var_name):
+                break
+        self.existing_names.add(var_name)
+        return var_name
+
+_IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")
+
+def _gather_identifiers(s):  # set of all identifier-like words found in *s*
+    return set(match.group(1) for match in _IDENTIFIER_RE.finditer(s))
+
+class MakeUnique:
+    """Marker wrapping a partial identifier (kept in :attr:`name`) that is
+    to be made unique by the UI.
+    """
+
+    def __init__(self, name):
+        self.name = name
+
+# }}}
+
+# {{{ domain parsing
+
+def parse_domains(ctx, args_and_vars, domains, defines):
+    """Return *domains* as a list of isl sets, parsing string entries in *ctx*."""
+    result = []
+    available_parameters = args_and_vars.copy()
+    used_inames = set()
+    for dom in domains:
+        if isinstance(dom, str):
+            dom, = expand_defines(dom, defines)  # exactly one expansion expected
+
+            if not dom.lstrip().startswith("["):
+                # i.e. if no parameters are already given
+                ids = _gather_identifiers(dom)
+                parameters = ids & available_parameters
+                dom = "[%s] -> %s" % (",".join(parameters), dom)
+
+            try:
+                dom = isl.BasicSet.read_from_str(ctx, dom)
+            except:
+                print "failed to parse domain '%s'" % dom
+                raise
+        else:
+            assert isinstance(dom, (isl.Set, isl.BasicSet))
+            # assert dom.get_ctx() == ctx
+
+        for i_iname in xrange(dom.dim(dim_type.set)):
+            iname = dom.get_dim_name(dim_type.set, i_iname)
+
+            if iname is None:
+                raise RuntimeError("domain '%s' provided no iname at index "
+                        "%d (redefined iname?)" % (dom, i_iname))
+
+            if iname in used_inames:
+                raise RuntimeError("domain '%s' redefines iname '%s' "
+                        "that is part of a previous domain" % (dom, iname))
+
+            used_inames.add(iname)
+            available_parameters.add(iname)  # later domains may use it as a parameter
+
+        result.append(dom)
+
+    return result
+
+# }}}
+
+# {{{ expand defines
+
+WORD_RE = re.compile(r"\b([a-zA-Z0-9_]+)\b")  # a bare word
+BRACE_RE = re.compile(r"\$\{([a-zA-Z0-9_]+)\}")  # the ${name} form
+
+def expand_defines(insn, defines, single_valued=True):
+    """Yield every string obtained from *insn* by substituting the values in
+    *defines* for ``${name}`` and bare-word occurrences of their names."""
+    replacements = [()]
+    for find_regexp, replace_pattern in [
+            (BRACE_RE, r"\$\{%s\}"),
+            (WORD_RE, r"\b%s\b"),
+            ]:
+        for match in find_regexp.finditer(insn):
+            word = match.group(1)
+
+            try:
+                value = defines[word]
+            except KeyError:
+                continue  # not a defined name
+
+            if isinstance(value, list):
+                if single_valued:
+                    raise ValueError("multi-valued macro expansion not allowed "
+                            "in this context (when expanding '%s')" % word)
+                # Fan out: one candidate per (candidate so far, list element).
+                replacements = [
+                        rep+((replace_pattern % word, subval),)
+                        for rep in replacements
+                        for subval in value
+                        ]
+            else:
+                replacements = [
+                        rep+((replace_pattern % word, value),)
+                        for rep in replacements]
+    # Apply each candidate's (regex, value) pairs in the order collected.
+    for rep in replacements:
+        rep_value = insn
+        for pattern, val in rep:
+            rep_value = re.sub(pattern, str(val), rep_value)
+
+        yield rep_value
+
+def expand_defines_in_expr(expr, defines):
+    """Run a SubstitutionMapper over *expr* whose callback answers, for each
+    Variable named in *defines*, with the parsed string form of the define's
+    value, and with None for everything else."""
+    from pymbolic.primitives import Variable
+    from loopy.symbolic import parse, SubstitutionMapper
+
+    def subst_func(var):
+        if not isinstance(var, Variable):
+            return None
+        try:
+            var_value = defines[var.name]
+        except KeyError:
+            return None
+        return parse(str(var_value))
+
+    return SubstitutionMapper(subst_func)(expr)
+
+# }}}
+
+# {{{ parse instructions
+
+INSN_RE = re.compile(
+        r"\s*(?:\<(?P<temp_var_type>.*?)\>)?"  # optional "<type>" prefix
+        r"\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"  # lhs = rhs, but not ":="
+        r"\s*?(?:\{(?P<options>[\s\w=,:]+)\}\s*)?$"  # optional "{options}"
+        )
+SUBST_RE = re.compile(
+        r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$"
+        )
+
+def parse_insn(insn):
+    """Parse one instruction line into an Instruction or a SubstitutionRule."""
+    insn_match = INSN_RE.match(insn)
+    subst_match = SUBST_RE.match(insn)
+    if insn_match is not None and subst_match is not None:
+        raise RuntimeError("instruction parse error: %s" % insn)
+
+    if insn_match is not None:
+        groups = insn_match.groupdict()
+    elif subst_match is not None:
+        groups = subst_match.groupdict()
+    else:
+        raise RuntimeError("insn parse error: %s" % insn)
+
+    from loopy.symbolic import parse
+    lhs = parse(groups["lhs"])
+    rhs = parse(groups["rhs"])
+
+    if insn_match is not None:
+        insn_deps = set()
+        insn_id = None
+        priority = 0
+
+        if groups["options"] is not None:
+            for option in groups["options"].split(","):
+                option = option.strip()
+                if not option:
+                    raise RuntimeError("empty option supplied")
+
+                opt_key, eq_sign, opt_value = option.partition("=")
+                opt_key = opt_key.strip()
+                opt_value = opt_value.strip() if eq_sign else None  # bare key
+
+                if opt_key in ("priority", "dep") and opt_value is None:
+                    raise ValueError("instruction option '%s' requires "
+                            "a value" % opt_key)
+                if opt_key == "id":
+                    insn_id = opt_value
+                elif opt_key == "priority":
+                    priority = int(opt_value)
+                elif opt_key == "dep":
+                    insn_deps = set(opt_value.split(":"))
+                else:
+                    raise ValueError("unrecognized instruction option '%s'"
+                            % opt_key)
+
+        if groups["temp_var_type"] is not None:
+            if groups["temp_var_type"]:
+                temp_var_type = np.dtype(groups["temp_var_type"])
+            else:
+                from loopy import infer_type
+                temp_var_type = infer_type
+        else:
+            temp_var_type = None
+
+        from pymbolic.primitives import Variable, Subscript
+        if not isinstance(lhs, (Variable, Subscript)):
+            raise RuntimeError("left hand side of assignment '%s' must "
+                    "be variable or subscript" % lhs)
+
+        return Instruction(
+                    id=insn_id,
+                    insn_deps=insn_deps,
+                    forced_iname_deps=frozenset(),
+                    assignee=lhs, expression=rhs,
+                    temp_var_type=temp_var_type,
+                    priority=priority)
+
+    elif subst_match is not None:
+        from pymbolic.primitives import Variable, Call
+
+        if isinstance(lhs, Variable):
+            subst_name = lhs.name
+            arg_names = []
+        elif isinstance(lhs, Call):
+            if not isinstance(lhs.function, Variable):
+                raise RuntimeError("Invalid substitution rule left-hand side")
+            subst_name = lhs.function.name
+            arg_names = []
+
+            for i, arg in enumerate(lhs.parameters):
+                if not isinstance(arg, Variable):
+                    raise RuntimeError("Invalid substitution rule "
+                                    "left-hand side: %s--arg number %d "
+                                    "is not a variable"% (lhs, i))
+                arg_names.append(arg.name)
+        else:
+            raise RuntimeError("Invalid substitution rule left-hand side")
+
+        return SubstitutionRule(
+                name=subst_name,
+                arguments=tuple(arg_names),
+                expression=rhs)
+
+def parse_if_necessary(insn, defines):
+    """Yield parsed objects for *insn*: an Instruction is passed through,
+    a string is split into lines, stripped of '#' comments, macro-expanded
+    and parsed line by line."""
+    if isinstance(insn, Instruction):
+        yield insn
+        return
+    if not isinstance(insn, str):
+        raise TypeError("Instructions must be either an Instruction "
+                "instance or a parseable string. got '%s' instead."
+                % type(insn))
+
+    for line in insn.split("\n"):
+        # Everything from the first '#' onward is a comment.
+        line = line.partition("#")[0].strip()
+        if not line:
+            continue
+
+        for sub_insn in expand_defines(line, defines, single_valued=False):
+            yield parse_insn(sub_insn)
+
+# }}}
+
+# {{{ tag reduction inames as sequential
+
+def tag_reduction_inames_as_sequential(knl):
+    """Return *knl* with every untagged reduction iname of its instructions
+    tagged ForceSequentialTag; raise RuntimeError on a parallel-tagged one."""
+    from loopy import tag_inames
+    from loopy.kernel.data import ParallelTag, ForceSequentialTag
+    from loopy.symbolic import ReductionCallbackMapper
+
+    reduction_inames = set()
+    def map_reduction(red_expr, rec):
+        rec(red_expr.expr)
+        reduction_inames.update(red_expr.inames)
+
+    for insn in knl.instructions:
+        ReductionCallbackMapper(map_reduction)(insn.expression)
+
+    new_iname_to_tag = {}
+    for iname in reduction_inames:
+        tag = knl.iname_to_tag.get(iname)
+        if isinstance(tag, ParallelTag):
+            raise RuntimeError("inconsistency detected: "
+                    "reduction iname '%s' has "
+                    "a parallel tag" % iname)
+        if tag is None:
+            new_iname_to_tag[iname] = ForceSequentialTag()
+
+    return tag_inames(knl, new_iname_to_tag)
+
+# }}}
+
+# {{{ sanity checking
+
+def check_for_duplicate_names(knl):
+    """Raise RuntimeError if any iname, argument, temporary or substitution
+    name of *knl* is used more than once."""
+    name_to_source = {}
+
+    named_groups = [
+            ("iname", knl.all_inames()),
+            ("argument", [arg.name for arg in knl.args]),
+            ("temporary", knl.temporary_variables),
+            ("substitution", knl.substitutions),
+            ]
+    for source, names in named_groups:
+        for name in names:
+            if name in name_to_source:
+                raise RuntimeError("invalid %s name '%s'--name already used as "
+                        "%s" % (source, name, name_to_source[name]))
+
+            name_to_source[name] = source
+
+def check_for_nonexistent_iname_deps(knl):
+    """Raise ValueError if an instruction forces a dependency on an iname
+    that is not among ``knl.all_inames()``."""
+    for insn in knl.instructions:
+        missing = set(insn.forced_iname_deps) - knl.all_inames()
+        if missing:
+            raise ValueError("In instruction '%s': "
+                    "cannot force dependency on inames '%s'--"
+                    "they don't exist" % (insn.id, ",".join(missing)))
+
+def check_for_multiple_writes_to_loop_bounds(knl):
+    """Raise RuntimeError unless every temporary variable that is also a
+    parameter of one of the kernel's domains has exactly one entry in its
+    ``knl.writer_map()`` record."""
+    from islpy import dim_type
+
+    domain_parameters = set()
+    for dom in knl.domains:
+        domain_parameters.update(dom.get_space().get_var_dict(dim_type.param))
+
+    wmap = knl.writer_map()
+    for tvpar in domain_parameters & set(knl.temporary_variables):
+        n_writers = len(wmap[tvpar])
+        if n_writers != 1:
+            raise RuntimeError("there must be exactly one write to data-dependent "
+                    "domain parameter '%s' (found %d)" % (tvpar, n_writers))
+
+
+def check_written_variable_names(knl):
+    """Raise RuntimeError if an instruction assigns to a name that is neither
+    a kernel argument nor a temporary variable."""
+    admissible_vars = set(knl.temporary_variables.iterkeys())
+    admissible_vars.update(arg.name for arg in knl.args)
+    for insn in knl.instructions:
+        var_name = insn.get_assignee_var_name()
+        if var_name in admissible_vars:
+            continue
+        raise RuntimeError("variable '%s' not declared or not "
+                "allowed for writing" % var_name)
+
+# }}}
+
+# {{{ expand common subexpressions into assignments
+
+class CSEToAssignmentMapper(IdentityMapper):
+    """Replaces common subexpressions by variables obtained from a callback."""
+    def __init__(self, add_assignment):
+        self.add_assignment = add_assignment  # called as (prefix, expr, dtype)
+        self.expr_to_var = {}  # CSE child -> Variable already created for it
+
+    def map_common_subexpression(self, expr):
+        try:
+            return self.expr_to_var[expr.child]
+        except KeyError:
+            from loopy.symbolic import TypedCSE
+            if isinstance(expr, TypedCSE):
+                dtype = expr.dtype
+            else:
+                dtype = None
+            child = self.rec(expr.child)
+            from pymbolic.primitives import Variable
+            if isinstance(child, Variable):
+                return child  # already a plain variable: no assignment made
+
+            var_name = self.add_assignment(expr.prefix, child, dtype)
+            var = Variable(var_name)
+            self.expr_to_var[expr.child] = var
+            return var
+
+def expand_cses(knl):
+    """Return a copy of *knl* whose instruction right-hand sides have their
+    common subexpressions moved into new scalar temporaries and instructions."""
+    def add_assignment(base_name, expr, dtype):
+        if base_name is None:
+            base_name = "var"
+
+        new_var_name = var_name_gen(base_name)
+
+        if dtype is None:
+            from loopy import infer_type
+            dtype = infer_type
+        else:
+            dtype=np.dtype(dtype)
+
+        from loopy.kernel.data import TemporaryVariable
+        new_temp_vars[new_var_name] = TemporaryVariable(
+                name=new_var_name,
+                dtype=dtype,
+                is_local=None,
+                shape=())
+
+        from pymbolic.primitives import Variable
+        insn = Instruction(
+                id=knl.make_unique_instruction_id(extra_used_ids=newly_created_insn_ids),
+                assignee=Variable(new_var_name), expression=expr)
+        newly_created_insn_ids.add(insn.id)
+        new_insns.append(insn)
+
+        return new_var_name
+
+    cseam = CSEToAssignmentMapper(add_assignment=add_assignment)
+    new_insns = []
+    var_name_gen = knl.get_var_name_generator()
+
+    newly_created_insn_ids = set()
+    new_temp_vars = knl.temporary_variables.copy()
+    # cseam() runs first, so each CSE's instruction is appended before its user.
+    for insn in knl.instructions:
+        new_insns.append(insn.copy(expression=cseam(insn.expression)))
+
+    return knl.copy(
+            instructions=new_insns,
+            temporary_variables=new_temp_vars)
+
+# }}}
+
+# {{{ temporary variable creation
+
+def create_temporaries(knl):
+    """Return a copy of *knl* in which each instruction carrying a
+    ``temp_var_type`` has had a TemporaryVariable declared for its assignee."""
+    from loopy.kernel.data import TemporaryVariable
+    from pymbolic.primitives import Variable
+
+    new_insns = []
+    new_temp_vars = knl.temporary_variables.copy()
+
+    for insn in knl.instructions:
+        if insn.temp_var_type is None:
+            new_insns.append(insn)
+            continue
+
+        assignee_name = insn.get_assignee_var_name()
+        assignee_indices = []
+        for index_expr in insn.get_assignee_indices():
+            if not (isinstance(index_expr, Variable)
+                    and index_expr.name in knl.all_inames()):
+                raise RuntimeError(
+                        "only plain inames are allowed in "
+                        "the lvalue index when declaring the "
+                        "variable '%s' in an instruction"
+                        % assignee_name)
+            assignee_indices.append(index_expr.name)
+
+        base_indices, shape = \
+                knl.find_var_base_indices_and_shape_from_inames(
+                        assignee_indices, knl.cache_manager)
+
+        if assignee_name in new_temp_vars:
+            raise RuntimeError("cannot create temporary variable '%s'--"
+                    "already exists" % assignee_name)
+        if assignee_name in knl.arg_dict:
+            raise RuntimeError("cannot create temporary variable '%s'--"
+                    "already exists as argument" % assignee_name)
+
+        new_temp_vars[assignee_name] = TemporaryVariable(
+                name=assignee_name,
+                dtype=insn.temp_var_type,
+                is_local=None,
+                base_indices=base_indices,
+                shape=shape)
+        new_insns.append(insn.copy(temp_var_type=None))
+
+    return knl.copy(
+            instructions=new_insns,
+            temporary_variables=new_temp_vars)
+
+# }}}
+
+# {{{ check for reduction iname duplication
+
+def check_for_reduction_inames_duplication_requests(kernel):
+    """Raise RuntimeError if a reduction visited in an instruction or
+    substitution rule of *kernel* names an iname starting with '@'."""
+    from loopy.symbolic import ReductionCallbackMapper
+
+    # {{{ helper function
+
+    def check_reduction_inames(reduction_expr, rec):
+        if any(iname.startswith("@") for iname in reduction_expr.inames):
+            raise RuntimeError("Reduction iname duplication with '@' is no "
+                    "longer supported. Use loopy.duplicate_inames instead.")
+
+    # }}}
+
+    rcm = ReductionCallbackMapper(check_reduction_inames)
+    for insn in kernel.instructions:
+        rcm(insn.expression)
+
+    for sub_name, sub_rule in kernel.substitutions.iteritems():
+        rcm(sub_rule.expression)
+
+# }}}
+
+# {{{ kernel creation top-level
+
+def make_kernel(device, domains, instructions, kernel_args=[], *args, **kwargs):
+    """User-facing kernel creation entrypoint: parses *instructions* and
+    *domains*, builds the LoopKernel and runs the creation-time checks."""
+    for forbidden_kwarg in [
+            "substitutions",
+            "iname_slab_increments",
+            "applied_iname_rewrites",
+            "cache_manager",
+            "isl_context",
+            ]:
+        if forbidden_kwarg in kwargs:
+            raise RuntimeError("'%s' is not part of user-facing interface"
+                    % forbidden_kwarg)
+
+    defines = kwargs.get("defines", {})
+    temporary_variables = kwargs.get("temporary_variables", {})
+
+    # {{{ instruction/subst parsing
+
+    parsed_instructions = []
+    kwargs["substitutions"] = substitutions = {}
+
+    if isinstance(instructions, str):
+        instructions = [instructions]
+    for insn in instructions:
+        for new_insn in parse_if_necessary(insn, defines):
+            if isinstance(new_insn, Instruction):
+                parsed_instructions.append(new_insn)
+            elif isinstance(new_insn, SubstitutionRule):
+                substitutions[new_insn.name] = new_insn
+            else:
+                raise RuntimeError("unexpected type in instruction parsing")
+
+    instructions = parsed_instructions
+    del parsed_instructions
+
+    # }}}
+
+    # Ordering dependency:
+    # Domain construction needs to know what temporary variables are
+    # available. That information can only be obtained once instructions
+    # are parsed.
+
+    # {{{ parse domains
+
+    if isinstance(domains, str):
+        domains = [domains]
+    # Reuse the isl context of any domain passed as an isl Set or BasicSet.
+    isl_context = None
+    for domain in domains:
+        if isinstance(domain, (isl.Set, isl.BasicSet)):
+            isl_context = domain.get_ctx()
+    if isl_context is None:
+        isl_context = isl.Context()
+
+    from loopy.kernel.data import ValueArg
+    scalar_arg_names = set(arg.name for arg in kernel_args if isinstance(arg, ValueArg))
+    var_names = (
+            set(temporary_variables)
+            | set(insn.get_assignee_var_name()
+                for insn in instructions
+                if insn.temp_var_type is not None))
+    domains = parse_domains(isl_context, scalar_arg_names | var_names, domains,
+            defines)
+
+    kwargs["isl_context"] = isl_context
+
+    # }}}
+
+    from loopy.kernel import LoopKernel
+    knl = LoopKernel(device, domains, instructions, kernel_args, *args, **kwargs)
+
+    check_for_nonexistent_iname_deps(knl)
+    check_for_reduction_inames_duplication_requests(knl)
+
+    knl = tag_reduction_inames_as_sequential(knl)
+    knl = create_temporaries(knl)
+    knl = expand_cses(knl)
+
+    # -------------------------------------------------------------------------
+    # Ordering dependency:
+    # -------------------------------------------------------------------------
+    # Must create temporaries before checking for writes to temporary variables
+    # that are domain parameters.
+    # -------------------------------------------------------------------------
+
+    check_for_multiple_writes_to_loop_bounds(knl)
+    check_for_duplicate_names(knl)
+    check_written_variable_names(knl)
+
+    return knl
+
+# }}}
+
+# vim: fdm=marker
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..80e53b17d541bbba399f452bc266e2d808d3162d
--- /dev/null
+++ b/loopy/kernel/data.py
@@ -0,0 +1,559 @@
+"""Data used by the kernel object."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+from pytools import Record, memoize_method
+
+
+
+
+# {{{ index tags
+
+class IndexTag(Record):  # base class of the tag classes in this section
+    __slots__ = []
+
+    def __hash__(self):  # direct hashing is refused; the message points to .key
+        raise RuntimeError("use .key to hash index tags")
+
+
+
+
+class ParallelTag(IndexTag):  # common base of the hardware and ILP tags
+    pass
+
+class HardwareParallelTag(ParallelTag):  # base of group and local index tags
+    pass
+
+class UniqueTag(IndexTag):
+    @property
+    def key(self):  # hashable stand-in for the tag: its class
+        return type(self)
+
+class AxisTag(UniqueTag):  # a unique tag qualified by an axis number
+    __slots__ = ["axis"]
+
+    def __init__(self, axis):
+        Record.__init__(self,
+                axis=axis)
+
+    @property
+    def key(self):  # (class, axis), so different axes give different keys
+        return (type(self), self.axis)
+
+    def __str__(self):  # print_name is supplied by the concrete subclasses
+        return "%s.%d" % (
+                self.print_name, self.axis)
+
+class GroupIndexTag(HardwareParallelTag, AxisTag):
+    print_name = "g"  # str() gives "g.<axis>", the form parse_tag() accepts
+
+class LocalIndexTagBase(HardwareParallelTag):  # base of all "l.*" tags
+    pass
+
+class LocalIndexTag(LocalIndexTagBase, AxisTag):
+    print_name = "l"  # str() gives "l.<axis>", the form parse_tag() accepts
+
+class AutoLocalIndexTagBase(LocalIndexTagBase):  # base of AutoFitLocalIndexTag
+    pass
+
+class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
+    def __str__(self):  # same spelling that parse_tag() maps to this class
+        return "l.auto"
+
+class IlpBaseTag(ParallelTag):  # base of the two "ilp.*" tags
+    pass
+
+class UnrolledIlpTag(IlpBaseTag):
+    def __str__(self):  # parse_tag() also accepts plain "ilp" for this tag
+        return "ilp.unr"
+
+class LoopedIlpTag(IlpBaseTag):
+    def __str__(self):  # same spelling that parse_tag() maps to this class
+        return "ilp.seq"
+
+class UnrollTag(IndexTag):
+    def __str__(self):  # same spelling that parse_tag() maps to this class
+        return "unr"
+
+class ForceSequentialTag(IndexTag):
+    def __str__(self):  # note: parse_tag() has no string spelling for this tag
+        return "forceseq"
+
+def parse_tag(tag):
+    """Convert *tag* (None, an IndexTag, or a string such as "unr", "ilp",
+    "g.0", "l.1", "l.auto" or "for") to an IndexTag instance or None."""
+    if tag is None or isinstance(tag, IndexTag):
+        return tag
+    if not isinstance(tag, str):
+        raise ValueError("cannot parse tag: %s" % tag)
+
+    if tag == "for":
+        return None
+
+    simple_tags = {
+            "unr": UnrollTag,
+            "ilp": UnrolledIlpTag,
+            "ilp.unr": UnrolledIlpTag,
+            "ilp.seq": LoopedIlpTag,
+            "l.auto": AutoFitLocalIndexTag,
+            }
+    if tag in simple_tags:
+        return simple_tags[tag]()
+
+    kind, axis = tag[:2], tag[2:]
+    if kind == "g.":
+        return GroupIndexTag(int(axis))
+    if kind == "l.":
+        return LocalIndexTag(int(axis))
+
+    raise ValueError("cannot parse tag: %s" % tag)
+
+# }}}
+
+# {{{ arguments
+
+class ShapedArg(Record):
+    def __init__(self, name, dtype=None, shape=None, strides=None, order="C",
+            offset=0):
+        """
+        All of the following are optional. Specify either strides or shape.
+
+        :arg shape: a tuple (or a single entry); string entries are parsed
+        :arg strides: like numpy strides, but in multiples of
+            data type size
+        :arg order: "C" or "F"; picks the strides derived from a bare shape
+        :arg offset: Offset from the beginning of the vector from which
+            the strides are counted.
+        """
+        if dtype is not None:
+            dtype = np.dtype(dtype)
+
+        def parse_if_necessary(x):
+            if isinstance(x, str):
+                from pymbolic import parse
+                return parse(x)
+            else:
+                return x
+
+        def process_tuple(x):  # normalize to a tuple of parsed entries
+            x = parse_if_necessary(x)
+            if not isinstance(x, tuple):
+                x = (x,)
+
+            return tuple(parse_if_necessary(xi) for xi in x)
+
+        if strides is not None:
+            strides = process_tuple(strides)
+
+        if shape is not None:
+            shape = process_tuple(shape)
+
+        if strides is None and shape is not None:
+            from pyopencl.compyte.array import (
+                    f_contiguous_strides,
+                    c_contiguous_strides)
+
+            if order == "F":
+                strides = f_contiguous_strides(1, shape)
+            elif order == "C":
+                strides = c_contiguous_strides(1, shape)
+            else:
+                raise ValueError("invalid order: %s" % order)
+
+        Record.__init__(self,
+                name=name,
+                dtype=dtype,
+                strides=strides,
+                offset=offset,
+                shape=shape)
+
+    @property
+    @memoize_method
+    def numpy_strides(self):  # the strides scaled by the dtype's item size
+        return tuple(self.dtype.itemsize*s for s in self.strides)
+
+    @property
+    def dimensions(self):  # len(shape), so shape must not be None here
+        return len(self.shape)
+
+class GlobalArg(ShapedArg):
+    def __repr__(self):  # shape may be None (it is optional); print it as ()
+        return "<GlobalArg '%s' of type %s and shape (%s)>" % (
+                self.name, self.dtype, ",".join(str(i) for i in self.shape or ()))
+
+class ArrayArg(GlobalArg):  # deprecated name of GlobalArg; warns when built
+    def __init__(self, *args, **kwargs):
+        from warnings import warn
+        warn("ArrayArg is a deprecated name of GlobalArg", DeprecationWarning,
+                stacklevel=2)  # stacklevel=2: report the caller's line
+        GlobalArg.__init__(self, *args, **kwargs)
+
+class ConstantArg(ShapedArg):
+    def __repr__(self):  # shape may be None (it is optional); print it as ()
+        return "<ConstantArg '%s' of type %s and shape (%s)>" % (
+                self.name, self.dtype, ",".join(str(i) for i in self.shape or ()))
+
+class ImageArg(Record):
+    def __init__(self, name, dtype=None, dimensions=None, shape=None):
+        # NOTE(review): np.dtype(None) is float64, so an omitted dtype is not
+        # kept as None here, unlike in ShapedArg/ValueArg -- confirm intended.
+        dtype = np.dtype(dtype)
+        if shape is not None:
+            if dimensions is not None and dimensions != len(shape):
+                raise RuntimeError("cannot specify both shape and "
+                        "disagreeing dimensions in ImageArg")
+            dimensions = len(shape)  # shape, when given, fixes the dimensions
+        else:
+            if not isinstance(dimensions, int):
+                raise RuntimeError("ImageArg: dimensions must be an integer")
+        Record.__init__(self,
+                dimensions=dimensions,
+                shape=shape,
+                dtype=dtype,
+                name=name)
+
+    def __repr__(self):
+        return "<ImageArg '%s' of type %s>" % (self.name, self.dtype)
+
+class ValueArg(Record):  # record of name, dtype and approximately
+    def __init__(self, name, dtype=None, approximately=None):
+        if dtype is not None:  # a missing dtype is stored as None
+            dtype = np.dtype(dtype)
+
+        Record.__init__(self, name=name, dtype=dtype,
+                approximately=approximately)
+
+    def __repr__(self):
+        return "<ValueArg '%s' of type %s>" % (self.name, self.dtype)
+
+class ScalarArg(ValueArg):  # deprecated name of ValueArg; warns when built
+    def __init__(self, name, dtype=None, approximately=None):
+        from warnings import warn
+        warn("ScalarArg is a deprecated name of ValueArg",
+                DeprecationWarning, stacklevel=2)
+
+        ValueArg.__init__(self, name, dtype, approximately)
+
+# }}}
+
+# {{{ temporary variable
+
+class TemporaryVariable(Record):
+    """
+    :ivar name:
+    :ivar dtype:
+    :ivar shape: a tuple (other iterables are converted), or None
+    :ivar storage_shape:
+    :ivar base_indices: defaults to one 0 per entry of a known *shape*
+    :ivar is_local:
+    """
+
+    def __init__(self, name, dtype, shape, is_local, base_indices=None,
+            storage_shape=None):
+        if shape is not None and not isinstance(shape, tuple):
+            shape = tuple(shape)
+        # Normalize shape first: the default below needs len() of a real tuple.
+        if base_indices is None and shape is not None:
+            base_indices = (0,) * len(shape)
+
+        Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local,
+                base_indices=base_indices,
+                storage_shape=storage_shape)
+
+    @property
+    def nbytes(self):
+        from pytools import product
+        return product(si for si in self.shape)*self.dtype.itemsize
+
+# }}}
+
+# {{{ substitution rule
+
+class SubstitutionRule(Record):
+    """
+    :ivar name: the rule's name
+    :ivar arguments: tuple of argument names (strings)
+    :ivar expression: the right-hand side of the rule
+    """
+
+    def __init__(self, name, arguments, expression):
+        assert isinstance(arguments, tuple)
+
+        Record.__init__(self,
+                name=name, arguments=arguments, expression=expression)
+
+    def __str__(self):  # renders as "name(arg1, arg2) := expression"
+        return "%s(%s) := %s" % (
+                self.name, ", ".join(self.arguments), self.expression)
+
+# }}}
+
+# {{{ instruction
+
+class Instruction(Record):
+    """
+    :ivar id: An (otherwise meaningless) identifier that is unique within
+        a :class:`LoopKernel`.
+    :ivar assignee: the left-hand side; a string is parsed into an expression
+    :ivar expression: the right-hand side; a string is parsed into an expression
+    :ivar forced_iname_deps: a frozenset of inames that are added to the list
+        of iname dependencies
+    :ivar insn_deps: a set of ids of :class:`Instruction` instances that
+        *must* be executed before this one. Note that loop scheduling augments this
+        by adding dependencies on any writes to temporaries read by this instruction.
+    :ivar boostable: Whether the instruction may safely be executed
+        inside more loops than advertised without changing the meaning
+        of the program. Allowed values are *None* (for unknown), *True*, and *False*.
+    :ivar boostable_into: a set of inames into which the instruction
+        may need to be boosted, as a heuristic help for the scheduler.
+    :ivar priority: scheduling priority
+
+    The following instance variable is only used until :func:`loopy.make_kernel` is
+    finished:
+
+    :ivar temp_var_type: if not None, a type that will be assigned to the new temporary variable
+        created from the assignee
+    """
+    def __init__(self,
+            id, assignee, expression,
+            forced_iname_deps=frozenset(), insn_deps=None, boostable=None,
+            boostable_into=None,
+            temp_var_type=None, priority=0):
+        # A fresh set per instance: a set() default would be shared by all of them.
+        if insn_deps is None:
+            insn_deps = set()
+
+        from loopy.symbolic import parse
+        if isinstance(assignee, str):
+            assignee = parse(assignee)
+        if isinstance(expression, str):
+            expression = parse(expression)
+        assert isinstance(forced_iname_deps, frozenset)
+        assert isinstance(insn_deps, set)
+
+        Record.__init__(self,
+                id=id, assignee=assignee, expression=expression,
+                forced_iname_deps=forced_iname_deps,
+                insn_deps=insn_deps, boostable=boostable,
+                boostable_into=boostable_into,
+                temp_var_type=temp_var_type,
+                priority=priority)
+
+    @memoize_method
+    def reduction_inames(self):
+        """Return the set of inames collected by the reduction callback below."""
+        def map_reduction(expr, rec):
+            rec(expr.expr)
+            result.update(expr.inames)
+
+        from loopy.symbolic import ReductionCallbackMapper
+        result = set()
+        ReductionCallbackMapper(map_reduction)(self.expression)
+        return result
+
+    def __str__(self):
+        """Return ``id: assignee <- expression`` plus boostability and options."""
+        result = "%s: %s <- %s" % (self.id,
+                self.assignee, self.expression)
+
+        if self.boostable == True:
+            if self.boostable_into:
+                result += " (boostable into '%s')" % ",".join(self.boostable_into)
+            else:
+                result += " (boostable)"
+        elif self.boostable == False:
+            result += " (not boostable)"
+        elif self.boostable is not None:
+            raise RuntimeError("unexpected value for Instruction.boostable")
+
+        options = []
+        if self.insn_deps:
+            # sorted, so that the output does not depend on set iteration order
+            options.append("deps="+":".join(sorted(self.insn_deps)))
+        if self.priority:
+            options.append("priority=%d" % self.priority)
+        if options:
+            result += " (%s)" % ", ".join(options)
+        return result
+
+    @memoize_method
+    def get_assignee_var_name(self):
+        """Return the name of the variable that the assignee writes to.
+
+        :raises RuntimeError: if the assignee is not a variable or subscript
+        """
+        from pymbolic.primitives import Variable, Subscript
+        if isinstance(self.assignee, Variable):
+            return self.assignee.name
+        if isinstance(self.assignee, Subscript):
+            agg = self.assignee.aggregate
+            assert isinstance(agg, Variable)
+            return agg.name
+        raise RuntimeError("invalid lvalue '%s'" % self.assignee)
+
+    @memoize_method
+    def get_assignee_indices(self):
+        """Return the assignee's subscript as a tuple, () for a plain variable."""
+        from pymbolic.primitives import Variable, Subscript
+
+        if isinstance(self.assignee, Variable):
+            return ()
+        if isinstance(self.assignee, Subscript):
+            result = self.assignee.index
+            if not isinstance(result, tuple):
+                result = (result,)
+            return result
+        raise RuntimeError("invalid lvalue '%s'" % self.assignee)
+
+    @memoize_method
+    def get_read_var_names(self):
+        """Return ``get_dependencies`` applied to the right-hand side."""
+        from loopy.symbolic import get_dependencies
+        return get_dependencies(self.expression)
+
+# }}}
+
+# {{{ function manglers / dtype getters
+
+def default_function_mangler(name, arg_dtypes):
+    """Return the first non-None result of the default manglers, else None."""
+    from loopy.reduction import reduction_function_mangler
+
+    for candidate in (reduction_function_mangler,):
+        mangled = candidate(name, arg_dtypes)
+        if mangled is not None:
+            return mangled
+
+    return None
+
+def opencl_function_mangler(name, arg_dtypes):
+    """Return ``(result_dtype, c_name)`` for functions known here, else None."""
+    if name == "atan2" and len(arg_dtypes) == 2:
+        return arg_dtypes[0], name
+
+    if len(arg_dtypes) == 1 and arg_dtypes[0].kind == "c":
+        arg_dtype, = arg_dtypes
+        if arg_dtype == np.complex64:
+            tpname = "cfloat"
+        elif arg_dtype == np.complex128:
+            tpname = "cdouble"
+        else:
+            raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
+
+        if name in ["sqrt", "exp", "log", "sin", "cos", "tan",
+                "sinh", "cosh", "tanh"]:
+            return arg_dtype, "%s_%s" % (tpname, name)
+        if name in ["real", "imag"]:
+            return np.dtype(arg_dtype.type(0).real), "%s_%s" % (tpname, name)
+
+    # "dot" is resolved only for dtypes with an "s0" field. A field entry is
+    # (dtype, offset) or (dtype, offset, title), hence the [0] and no unpacking.
+    if name == "dot" and len(arg_dtypes) > 0:
+        fields = arg_dtypes[0].fields or {}
+        if "s0" in fields:
+            return fields["s0"][0], name
+
+    return None
+
+def single_arg_function_mangler(name, arg_dtypes):
+    """Map a one-argument call to ``(that argument's dtype, name)``, else None."""
+    if len(arg_dtypes) != 1:
+        return None
+    dtype, = arg_dtypes
+    return dtype, name
+
+def opencl_symbol_mangler(name):
+    """Return ``(dtype, name)`` for FLT_*, DBL_* and M_* symbols, else None.
+
+    FLT_* and M_*_F map to float32; DBL_* and all other M_* map to float64.
+    """
+    # FIXME: should be more picky about exact names
+    single = name.startswith("FLT_") or (
+            name.startswith("M_") and name.endswith("_F"))
+    if single:
+        return np.dtype(np.float32), name
+    if name.startswith("DBL_") or name.startswith("M_"):
+        return np.dtype(np.float64), name
+    return None
+
+# }}}
+
+# {{{ preamble generators
+
+def default_preamble_generator(seen_dtypes, seen_functions):
+    """Yield ``(tag, code)`` pairs for the given *seen_dtypes* and *seen_functions*."""
+    from loopy.reduction import reduction_preamble_generator
+    for result in reduction_preamble_generator(seen_dtypes, seen_functions):
+        yield result
+
+    has_double = False
+    has_complex = False
+    # One pass over the dtypes: is float64/complex128 or any complex type present?
+    for dtype in seen_dtypes:
+        if dtype in [np.float64, np.complex128]:
+            has_double = True
+        if dtype.kind == "c":
+            has_complex = True
+
+    if has_double:
+        yield ("00_enable_double", """
+            #pragma OPENCL EXTENSION cl_khr_fp64: enable
+            """)
+
+    if has_complex:
+        if has_double:
+            yield ("10_include_complex_header", """
+                #define PYOPENCL_DEFINE_CDOUBLE
+
+                #include <pyopencl-complex.h>
+                """)
+        else:
+            yield ("10_include_complex_header", """
+                #include <pyopencl-complex.h>
+                """)
+    # These are non-raw strings: a trailing backslash swallows the line break that follows.
+    c_funcs = set(c_name for name, c_name, arg_dtypes in seen_functions)
+    if "int_floor_div" in c_funcs:
+        yield ("05_int_floor_div", """
+            #define int_floor_div(a,b) \
+              (( (a) - \
+                 ( ( (a)<0 ) != ( (b)<0 )) \
+                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
+               / (b) )
+            """)
+
+    if "int_floor_div_pos_b" in c_funcs:
+        yield ("05_int_floor_div_pos_b", """
+            #define int_floor_div_pos_b(a,b) ( \
+                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
+                )
+            """)
+
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8d170d8e9f3906b1ac0b93b56d3342d3a64e7fe
--- /dev/null
+++ b/loopy/kernel/tools.py
@@ -0,0 +1,319 @@
+"""Operations on the kernel object."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+
+
+import numpy as np
+from pytools import Record, memoize_method
+import islpy as isl
+from islpy import dim_type
+
+import re
+
+
+
+
+# {{{ add and infer argument dtypes
+
+def add_argument_dtypes(knl, dtype_dict):
+    """Return a copy of *knl* whose arguments carry the dtypes in *dtype_dict*.
+
+    :arg dtype_dict: maps argument names to anything :class:`numpy.dtype`
+        accepts; entries whose value is *None* are ignored
+    :raises RuntimeError: if an argument already has a different dtype, or
+        if *dtype_dict* has a name that is not an argument of *knl*
+    """
+    dtype_dict = dtype_dict.copy()  # entries are popped below; spare the caller's dict
+    new_args = []
+    for arg in knl.args:
+        new_dtype = dtype_dict.pop(arg.name, None)
+        if new_dtype is not None:
+            new_dtype = np.dtype(new_dtype)
+            if arg.dtype is not None and arg.dtype != new_dtype:
+                raise RuntimeError(
+                        "argument '%s' already has a different dtype "
+                        "(existing: %s, new: %s)"
+                        % (arg.name, arg.dtype, new_dtype))
+            arg = arg.copy(dtype=new_dtype)
+        new_args.append(arg)
+    if dtype_dict:
+        raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict))
+    return knl.copy(args=new_args)  # the only copy: built once, after validation
+
+def infer_argument_dtypes(knl):
+    """Return a copy of *knl* with missing argument dtypes filled in where safe.
+
+    Arguments named in ``knl.all_params()`` get ``knl.index_dtype``. Any other
+    argument gets the dtype inferred from the instructions writing it, but only
+    if all of those writers agree and none of them fails type inference.
+    """
+    new_args = []
+    writer_map = knl.writer_map()
+
+    from loopy.codegen.expression import (
+            TypeInferenceMapper, TypeInferenceFailure)
+    tim = TypeInferenceMapper(knl)
+    for arg in knl.args:
+        if arg.dtype is None:
+            new_dtype = None
+            if arg.name in knl.all_params():
+                new_dtype = knl.index_dtype
+            else:
+                try:
+                    for write_insn_id in writer_map.get(arg.name, ()):
+                        write_insn = knl.id_to_insn[write_insn_id]
+                        new_tim_dtype = tim(write_insn.expression)
+                        if new_dtype is None:
+                            new_dtype = new_tim_dtype
+                        elif new_dtype != new_tim_dtype:
+                            # Now we know *nothing*.
+                            new_dtype = None
+                            break
+                except TypeInferenceFailure:
+                    # Even one type inference failure is enough to
+                    # make this dtype not safe to guess, so also drop
+                    # whatever the earlier writers suggested.
+                    new_dtype = None
+            if new_dtype is not None:
+                arg = arg.copy(dtype=new_dtype)
+        new_args.append(arg)
+    return knl.copy(args=new_args)
+
+def get_arguments_with_incomplete_dtype(knl):
+    """Return the names of the arguments of *knl* whose dtype is None."""
+    return [arg.name for arg in knl.args if arg.dtype is None]
+
+# }}}
+
+# {{{ find_all_insn_inames fixed point iteration
+
+def find_all_insn_inames(kernel):
+    """Return a dict mapping instruction ids to the inames each one depends on."""
+    from loopy.symbolic import get_dependencies
+    writer_map = kernel.writer_map()
+    # NOTE(review): writer_map is taken before expand_subst() below replaces kernel.
+    insn_id_to_inames = {}
+    insn_assignee_inames = {}
+
+    all_read_deps = {}
+    all_write_deps = {}
+
+    from loopy.subst import expand_subst
+    kernel = expand_subst(kernel)
+    # Seed each instruction with the inames it mentions directly, plus forced ones.
+    for insn in kernel.instructions:
+        all_read_deps[insn.id] = read_deps = get_dependencies(insn.expression)
+        all_write_deps[insn.id] = write_deps = get_dependencies(insn.assignee)
+        deps = read_deps | write_deps
+
+        iname_deps = (
+                deps & kernel.all_inames()
+                | insn.forced_iname_deps)
+
+        insn_id_to_inames[insn.id] = iname_deps
+        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
+
+    temp_var_names = set(kernel.temporary_variables.iterkeys())
+
+    # fixed point iteration until all iname dep sets have converged
+
+    # Why is fixed point iteration necessary here? Consider the following
+    # scenario:
+    #
+    # z = expr(iname)
+    # y = expr(z)
+    # x = expr(y)
+    #
+    # x clearly has a dependency on iname, but this is not found until that
+    # dependency has propagated all the way up. Doing this recursively is
+    # not guaranteed to terminate because of circular dependencies.
+
+    while True:
+        did_something = False
+        for insn in kernel.instructions:
+
+            # {{{ dependency-based propagation
+
+            # For all variables that insn depends on, find the intersection
+            # of iname deps of all writers, and add those to insn's
+            # dependencies.
+
+            for tv_name in (all_read_deps[insn.id] & temp_var_names):
+                implicit_inames = None
+
+                for writer_id in writer_map[tv_name]:
+                    writer_implicit_inames = (
+                            insn_id_to_inames[writer_id]
+                            - insn_assignee_inames[writer_id])
+                    if implicit_inames is None:
+                        implicit_inames = writer_implicit_inames
+                    else:
+                        implicit_inames = (implicit_inames
+                                & writer_implicit_inames)
+                # NOTE(review): if tv_name had no writers, implicit_inames is still None here.
+                inames_old = insn_id_to_inames[insn.id]
+                inames_new = (inames_old | implicit_inames) \
+                            - insn.reduction_inames()
+                insn_id_to_inames[insn.id] = inames_new
+
+                if inames_new != inames_old:
+                    did_something = True
+
+            # }}}
+
+            # {{{ domain-based propagation
+
+            # Add all inames occurring in parameters of domains that my current
+            # inames refer to.
+
+            inames_old = insn_id_to_inames[insn.id]
+            inames_new = set(insn_id_to_inames[insn.id])
+
+            for iname in inames_old:
+                home_domain = kernel.domains[kernel.get_home_domain_index(iname)]
+
+                for par in home_domain.get_var_names(dim_type.param):
+                    if par in kernel.all_inames():
+                        inames_new.add(par)
+
+            if inames_new != inames_old:
+                did_something = True
+                insn_id_to_inames[insn.id] = frozenset(inames_new)
+
+            # }}}
+
+        if not did_something:
+            break
+
+    return insn_id_to_inames
+
+# }}}
+
+# {{{ set operation cache
+
+class SetOperationCacheManager:
+    """Caches the results of operations on sets, per set, operation and args."""
+    def __init__(self):
+        # mapping: set hash -> [(set, op_name, args, result)]
+        self.cache = {}
+
+    def op(self, set, op_name, op, args):
+        """Return ``op(*args)``, cached per (*set*, *op_name*, *args*)."""
+        bucket = self.cache.setdefault(hash(set), [])
+        for bkt_set, bkt_op, bkt_args, result in bucket:
+            # bkt_op is the stored operation *name*: it has to be compared with
+            # op_name, since the callable *op* never equals it (no cache hits).
+            if set.plain_is_equal(bkt_set) and op_name == bkt_op and args == bkt_args:
+                return result
+
+        result = op(*args)
+        bucket.append((set, op_name, args, result))
+        return result
+
+    def dim_min(self, set, *args):
+        return self.op(set, "dim_min", set.dim_min, args)
+
+    def dim_max(self, set, *args):
+        return self.op(set, "dim_max", set.dim_max, args)
+
+    def base_index_and_length(self, set, iname, context=None):
+        """Return ``(base_index, size)`` of *iname* in *set*, via the cached bounds."""
+        iname_to_dim = set.space.get_var_dict()
+        lower_bound_pw_aff = self.dim_min(set, iname_to_dim[iname][1])
+        upper_bound_pw_aff = self.dim_max(set, iname_to_dim[iname][1])
+
+        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
+        from loopy.symbolic import pw_aff_to_expr
+        size = pw_aff_to_expr(static_max_of_pw_aff(
+                upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True,
+                context=context))
+        base_index = pw_aff_to_expr(
+            static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False,
+                context=context))
+        return base_index, size
+
+# }}}
+
+# {{{ domain change helper
+
+class DomainChanger:
+    """Picks the one domain that an operation on *inames* has to modify.
+
+    .. note:: Nothing is changed in place; :meth:`get_domains_with` returns
+        a new list of domains.
+    """
+    def __init__(self, kernel, inames):
+        self.kernel = kernel
+        if not inames:
+            # No inames given: work on the combination of no domains at all.
+            self.domain = kernel.combine_domains(())
+            self.leaf_domain_index = None
+            return
+
+        leaf_indices = kernel.get_leaf_domain_indices(inames)
+        if len(leaf_indices) > 1:
+            raise RuntimeError("Inames '%s' require more than one leaf "
+                    "domain, which makes the domain change that is part "
+                    "of your current operation ambiguous." % ", ".join(inames))
+        self.leaf_domain_index, = leaf_indices
+        self.domain = kernel.domains[self.leaf_domain_index]
+
+    def get_domains_with(self, replacement):
+        """Return a copy of the kernel's domains with *replacement* put in."""
+        domains = self.kernel.domains[:]
+        if self.leaf_domain_index is None:
+            domains.append(replacement)
+        else:
+            domains[self.leaf_domain_index] = replacement
+        return domains
+
+# }}}
+
+# {{{ graphviz / dot export
+
+def get_dot_dependency_graph(kernel, iname_cluster=False, iname_edge=True):
+    """Return graphviz 'dot' source for the instruction dependencies of *kernel*.
+
+    Optionally adds dotted iname->instruction edges and one cluster per iname.
+    """
+    lines = []
+    for insn in kernel.instructions:
+        lines.append("%s [shape=\"box\"];" % insn.id)
+        lines.extend("%s -> %s;" % (dep, insn.id) for dep in insn.insn_deps)
+        if iname_edge:
+            lines.extend("%s -> %s [style=\"dotted\"];" % (iname, insn.id)
+                    for iname in kernel.insn_inames(insn))
+    if iname_cluster:
+        for iname in kernel.all_inames():
+            members = " ".join(insn.id for insn in kernel.instructions
+                    if iname in kernel.insn_inames(insn))
+            lines.append("subgraph cluster_%s { label=\"%s\" %s }" % (iname, iname, members))
+    return "digraph loopy_deps {\n%s\n}" % "\n".join(lines)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/padding.py b/loopy/padding.py
index 9fbc4b59c8d54dc1777e78e445363b3293b28be3..7998dd4ba80370c500c6a853e181a4b33332f778 100644
--- a/loopy/padding.py
+++ b/loopy/padding.py
@@ -79,7 +79,7 @@ def split_arg_axis(kernel, args_and_axes, count):
     if len(args_and_axes) != len(arg_to_rest):
         raise RuntimeError("cannot split multiple axes of the same variable")
 
-    from loopy.kernel import GlobalArg
+    from loopy.kernel.data import GlobalArg
     for arg_name in arg_to_rest:
         if not isinstance(kernel.arg_dict[arg_name], GlobalArg):
             raise RuntimeError("only GlobalArg axes may be split")
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 98807b3b3d39e90c41576883658635b8fee7470c..f67b25e1fe8fb5cfb545d251e39ea046d428938b 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -87,7 +87,7 @@ def infer_types_of_temporaries(kernel):
 
     # {{{ work on type inference queue
 
-    from loopy.kernel import TemporaryVariable
+    from loopy.kernel.data import TemporaryVariable
 
     debug = 0
 
@@ -137,7 +137,7 @@ def infer_types_of_temporaries(kernel):
 
 def mark_local_temporaries(kernel):
     new_temp_vars = {}
-    from loopy.kernel import LocalIndexTagBase
+    from loopy.kernel.data import LocalIndexTagBase
 
     writers = kernel.writer_map()
 
@@ -246,9 +246,8 @@ def realize_reduction(kernel, insn_id_filter=None):
 
         arg_dtype = type_inf_mapper(expr.expr)
 
-        from loopy.kernel import Instruction
+        from loopy.kernel.data import Instruction, TemporaryVariable
 
-        from loopy.kernel import TemporaryVariable
         new_temporary_variables[target_var_name] = TemporaryVariable(
                 name=target_var_name,
                 shape=(),
@@ -369,7 +368,7 @@ class ExtraInameIndexInserter(IdentityMapper):
 def duplicate_private_temporaries_for_ilp(kernel):
     wmap = kernel.writer_map()
 
-    from loopy.kernel import IlpBaseTag
+    from loopy.kernel.data import IlpBaseTag
     from loopy.symbolic import get_dependencies
 
     var_to_new_ilp_inames = {}
@@ -591,7 +590,7 @@ def limit_boostability(kernel):
 # {{{ rank inames by stride
 
 def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel import ImageArg, ValueArg
+    from loopy.kernel.data import ImageArg, ValueArg
 
     approximate_arg_values = dict(
             (arg.name, arg.approximately)
@@ -629,7 +628,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     # {{{ figure out automatic-axis inames
 
-    from loopy.kernel import AutoLocalIndexTagBase
+    from loopy.kernel.data import AutoLocalIndexTagBase
     auto_axis_inames = set(
             iname
             for iname in kernel.insn_inames(insn)
@@ -690,7 +689,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 # {{{ assign automatic axes
 
 def assign_automatic_axes(kernel, axis=0, local_size=None):
-    from loopy.kernel import (AutoLocalIndexTagBase, LocalIndexTag)
+    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
 
     # Realize that at this point in time, axis lengths are already
     # fixed. So we compute them once and pass them to our recursive
diff --git a/loopy/schedule.py b/loopy/schedule.py
index dcbc1fec922dc12e577a817ad60a406b676e6089..944eef8d9d3ac752349f54061b16b4d99f21bb6e 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -72,7 +72,7 @@ def gather_schedule_subloop(schedule, start_idx):
 
 
 def get_barrier_needing_dependency(kernel, target, source, unordered=False):
-    from loopy.kernel import Instruction
+    from loopy.kernel.data import Instruction
     if not isinstance(source, Instruction):
         source = kernel.id_to_insn[source]
     if not isinstance(target, Instruction):
@@ -194,7 +194,7 @@ def loop_nest_map(kernel):
     iname_to_insns = kernel.iname_to_insns()
 
     # examine pairs of all inames--O(n**2), I know.
-    from loopy.kernel import IlpBaseTag
+    from loopy.kernel.data import IlpBaseTag
     for inner_iname in all_inames:
         result[inner_iname] = set()
         for outer_iname in kernel.all_inames():
@@ -798,7 +798,7 @@ def generate_loop_schedules(kernel, loop_priority=[], debug_args={}):
 
     debug = ScheduleDebugger(**debug_args)
 
-    from loopy.kernel import IlpBaseTag, ParallelTag
+    from loopy.kernel.data import IlpBaseTag, ParallelTag
     ilp_inames = set(
             iname
             for iname in kernel.all_inames()
diff --git a/loopy/subst.py b/loopy/subst.py
index 31c48dddb371a2d7be318b8c5efb29b46a8f27e1..f45226cf8b0a3a2d5bbfa8c53952f9305c9826c8 100644
--- a/loopy/subst.py
+++ b/loopy/subst.py
@@ -147,7 +147,7 @@ def extract_subst(kernel, subst_name, template, parameters):
         new_expr = cbmapper(insn.expression)
         new_insns.append(insn.copy(expression=new_expr))
 
-    from loopy.kernel import SubstitutionRule
+    from loopy.kernel.data import SubstitutionRule
     new_substs = {
             subst_name: SubstitutionRule(
                 name=subst_name,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 6ada11f181d9002c7eb55fe7683addcd7998d9a3..460e7b742555790ce5de153e2441e189efcd7f8f 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -420,7 +420,7 @@ class ExpandingIdentityMapper(IdentityMapper):
         :returns: (new_substitutions, subst_renames)
         """
 
-        from loopy.kernel import SubstitutionRule
+        from loopy.kernel.data import SubstitutionRule
 
         orig_name_histogram = {}
         for key, (name, orig_name) in self.subst_rule_registry.iteritems():