diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index d085c1215781f52a8f0c79359e8ac7061d94ffb7..386fbc18a2c5089486aba51395d671b4c9a600ff 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -112,7 +112,10 @@ Finishing up
 
 .. autofunction:: get_one_scheduled_kernel
 
-.. autofunction:: generate_code
+.. autoclass:: GeneratedProgram
+.. autoclass:: CodeGenerationResult
+
+.. autofunction:: generate_code_v2
 
 Setting options
 ---------------
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index cde36163d09a5a7d31a1afc97e0bfcaa636cdda3..f6e7ad9c2211d24582e5027777b584fc5ac64d98 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -721,7 +721,7 @@ those for us:
 
 .. doctest::
 
-    >>> glob, loc = knl.get_grid_sizes()
+    >>> glob, loc = knl.get_grid_size_upper_bounds()
     >>> print(glob)
     (Aff("[n] -> { [(floor((127 + n)/128))] }"),)
     >>> print(loc)
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index ecb45edbf0af51bdcbb1efcbbee8e18e0f3b500f..b71c561195647b7e35bc5538cfd1d601e270f3c2 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -5,14 +5,16 @@ import pyopencl.array
 
 knl = lp.make_kernel(
         "{ [i,k]: 0<=i<n and 0<=k<3 }",
-        """c[k,i] = a[k, i + 1]
-           out[k,i] = c[k,i]""",
-        ["..."])
+        """
+        c[k,i] = a[k, i + 1]
+        out[k,i] = c[k,i]
+        """)
 
 # transform
 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 from loopy.kernel.tools import add_dtypes
-knl = add_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+knl = add_dtypes(knl,
+        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
 
 # schedule
 from loopy.preprocess import preprocess_kernel
@@ -20,9 +22,11 @@ knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
 knl = get_one_scheduled_kernel(knl)
-print(knl)
 
 # map schedule onto host or device
-from loopy.codegen.device_mapping import map_schedule_onto_host_or_device
-knl = map_schedule_onto_host_or_device(knl)
 print(knl)
+
+cgr = lp.generate_code_v2(knl)
+
+print(cgr.device_code())
+print(cgr.host_code())
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index a402896c85baa49bc23bb5d770607cf73641c273..fa581d4262e2f06addf81aeaecca5ed2f8f8c8f1 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -31,9 +31,9 @@ def transform(knl, vars, stream_dtype):
 def gen_code(knl):
     knl = lp.preprocess_kernel(knl)
     knl = lp.get_one_scheduled_kernel(knl)
-    ispc_code, arg_info = lp.generate_code(knl)
+    codegen_result = lp.generate_code_v2(knl)
 
-    return ispc_code
+    return codegen_result.device_code() + "\n" + codegen_result.host_code()
 
 
 NRUNS = 10
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 10bd1100126e79ff2451eaf96c45bd4be47adf3a..8562df4118534392cf66ada250aacdfdbc0e1917 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -110,7 +110,11 @@ from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
         get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
         sum_mem_access_to_bytes,
         gather_access_footprints, gather_access_footprint_bytes)
-from loopy.codegen import generate_code, generate_body
+from loopy.codegen import (
+        generate_code, generate_code_v2, generate_body)
+from loopy.codegen.result import (
+        GeneratedProgram,
+        CodeGenerationResult)
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
 from loopy.auto_test import auto_test_vs_ref
@@ -196,7 +200,8 @@ __all__ = [
 
         "preprocess_kernel", "realize_reduction", "infer_unknown_types",
         "generate_loop_schedules", "get_one_scheduled_kernel",
-        "generate_code", "generate_body",
+        "GeneratedProgram", "CodeGenerationResult",
+        "generate_code", "generate_code_v2", "generate_body",
 
         "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
         "get_DRAM_access_poly",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index f34a943831c68bd0d2cf6089d8b70ad4bc43022e..0adf4416d44e199f1b329205a3159736acfa3fa3 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -442,7 +442,8 @@ def auto_test_vs_ref(
 
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel, ref_cl_kernel_info.impl_arg_info,
+                    make_ref_args(ref_sched_kernel,
+                            ref_cl_kernel_info.implemented_data_info,
                             ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
@@ -529,7 +530,8 @@ def auto_test_vs_ref(
         if args is None:
             cl_kernel_info = compiled.cl_kernel_info(frozenset())
 
-            args = make_args(kernel, cl_kernel_info.impl_arg_info,
+            args = make_args(kernel,
+                    cl_kernel_info.implemented_data_info,
                     queue, ref_arg_data, parameters)
         args["out_host"] = False
 
diff --git a/loopy/check.py b/loopy/check.py
index 5ae22cac5e5af8b727ae0ba577fc05a8e571b8a7..0ef3d27cfeea142e905dbb38aaaad9afbe06ebcf 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -60,7 +60,8 @@ def check_loop_priority_inames_known(kernel):
 
 
 def check_for_unused_hw_axes_in_insns(kernel):
-    group_size, local_size = kernel.get_grid_sizes_as_exprs()
+    # FIXME: This could be made specific to the current kernel piece.
+    group_size, local_size = kernel.get_grid_size_upper_bounds_as_exprs()
 
     group_axes = set(ax for ax, length in enumerate(group_size))
     local_axes = set(ax for ax, length in enumerate(local_size))
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 4056c0dea785a6d27225a4fd2b3337c9c9a14db9..ec0b39835de4da3df95d013ccdfb0417d0ab2b91 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -36,111 +36,95 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-# {{{ support code for AST wrapper objects
-
-class GeneratedInstruction(Record):
-    """Objects of this type are wrapped around ASTs upon
-    return from generation calls to collect information about them.
-
-    :ivar implemented_domains: A map from an insn id to a list of
-        implemented domains, with the purpose of checking that
-        each instruction's exact iteration space has been covered.
-    """
-    __slots__ = ["insn_id", "implemented_domain", "ast"]
-
-
-class GeneratedCode(Record):
-    """Objects of this type are wrapped around ASTs upon
-    return from generation calls to collect information about them.
+# {{{ implemented data info
 
-    :ivar implemented_domains: A map from an insn id to a list of
-        implemented domains, with the purpose of checking that
-        each instruction's exact iteration space has been covered.
+class ImplementedDataInfo(Record):
     """
-    __slots__ = ["ast", "implemented_domains"]
-
-
-def gen_code_block(elements):
-    from cgen import Block, Comment, Line, Initializer
-
-    block_els = []
-    implemented_domains = {}
+    .. attribute:: name
 
-    for el in elements:
-        if isinstance(el, GeneratedCode):
-            for insn_id, idoms in six.iteritems(el.implemented_domains):
-                implemented_domains.setdefault(insn_id, []).extend(idoms)
+        The expanded name of the array. Note that, for example
+        in the case of separate-array-tagged axes, multiple
+        implemented arrays may correspond to one user-facing
+        array.
 
-            if isinstance(el.ast, Block):
-                block_els.extend(el.ast.contents)
-            else:
-                block_els.append(el.ast)
+    .. attribute:: dtype
 
-        elif isinstance(el, Initializer):
-            block_els.append(el)
+    .. attribute:: arg_class
 
-        elif isinstance(el, Comment):
-            block_els.append(el)
+    .. attribute:: base_name
 
-        elif isinstance(el, Line):
-            assert not el.text
-            block_els.append(el)
+        The user-facing name of the underlying array.
+        May be *None* for non-array arguments.
 
-        elif isinstance(el, GeneratedInstruction):
-            block_els.append(el.ast)
-            if el.implemented_domain is not None:
-                implemented_domains.setdefault(el.insn_id, []).append(
-                        el.implemented_domain)
+    .. attribute:: shape
+    .. attribute:: strides
 
-        else:
-            raise ValueError("unrecognized object of type '%s' in block"
-                    % type(el))
+        Strides in multiples of ``dtype.itemsize``.
 
-    if len(block_els) == 1:
-        ast, = block_els
-    else:
-        ast = Block(block_els)
+    .. attribute:: unvec_shape
+    .. attribute:: unvec_strides
 
-    return GeneratedCode(ast=ast, implemented_domains=implemented_domains)
+        Strides in multiples of ``dtype.itemsize`` that accounts for
+        :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar
+        manner
 
 
-def wrap_in(cls, *args):
-    inner = args[-1]
-    args = args[:-1]
+    .. attribute:: offset_for_name
+    .. attribute:: stride_for_name_and_axis
 
-    if not isinstance(inner, GeneratedCode):
-        raise ValueError("unrecognized object of type '%s' in block"
-                % type(inner))
+        A tuple *(name, axis)* indicating the (implementation-facing)
+        name of the array and axis number for which this argument provides
+        the strides.
 
-    args = args + (inner.ast,)
+    .. attribute:: allows_offset
+    .. attribute:: is_written
+    """
 
-    return GeneratedCode(ast=cls(*args),
-            implemented_domains=inner.implemented_domains)
+    def __init__(self, target, name, dtype, arg_class,
+            base_name=None,
+            shape=None, strides=None,
+            unvec_shape=None, unvec_strides=None,
+            offset_for_name=None, stride_for_name_and_axis=None,
+            allows_offset=None,
+            is_written=None):
 
+        from loopy.types import LoopyType
+        assert isinstance(dtype, LoopyType)
 
-def wrap_in_if(condition_codelets, inner):
-    from cgen import If
+        Record.__init__(self,
+                name=name,
+                dtype=dtype,
+                arg_class=arg_class,
+                base_name=base_name,
+                shape=shape,
+                strides=strides,
+                unvec_shape=unvec_shape,
+                unvec_strides=unvec_strides,
+                offset_for_name=offset_for_name,
+                stride_for_name_and_axis=stride_for_name_and_axis,
+                allows_offset=allows_offset,
+                is_written=is_written)
 
-    if condition_codelets:
-        return wrap_in(If,
-                "\n&& ".join(condition_codelets),
-                inner)
+# }}}
 
-    return inner
 
+# {{{ code generation state
 
-def add_comment(cmt, code):
-    if cmt is None:
-        return code
+class Unvectorizable(Exception):
+    pass
 
-    from cgen import add_comment
-    assert isinstance(code, GeneratedCode)
 
-    return GeneratedCode(
-            ast=add_comment(cmt, code.ast),
-            implemented_domains=code.implemented_domains)
+class VectorizationInfo(object):
+    """
+    .. attribute:: iname
+    .. attribute:: length
+    .. attribute:: space
+    """
 
-# }}}
+    def __init__(self, iname, length, space):
+        self.iname = iname
+        self.length = length
+        self.space = space
 
 
 class SeenFunction(Record):
@@ -163,28 +147,13 @@ class SeenFunction(Record):
                 + tuple((f, getattr(self, f)) for f in type(self).fields))
 
 
-# {{{ code generation state
-
-class Unvectorizable(Exception):
-    pass
-
-
-class VectorizationInfo(object):
-    """
-    .. attribute:: iname
-    .. attribute:: length
-    .. attribute:: space
-    """
-
-    def __init__(self, iname, length, space):
-        self.iname = iname
-        self.length = length
-        self.space = space
-
-
 class CodeGenerationState(object):
     """
     .. attribute:: kernel
+    .. attribute:: implemented_data_info
+
+        a list of :class:`ImplementedDataInfo` objects.
+
     .. attribute:: implemented_domain
 
         The entire implemented domain (as an :class:`islpy.Set`)
@@ -212,13 +181,28 @@ class CodeGenerationState(object):
     .. attribute:: vectorization_info
 
         None or an instance of :class:`VectorizationInfo`
+
+    .. attribute:: is_generating_device_code
+
+    .. attribute:: gen_program_name
+
+        None (indicating that host code is being generated)
+        or the name of the device program currently being
+        generated.
+
+    .. attribute:: schedule_index_end
     """
 
-    def __init__(self, kernel, implemented_domain, implemented_predicates,
+    def __init__(self, kernel,
+            implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
-            vectorization_info=None, var_name_generator=None):
+            vectorization_info=None, var_name_generator=None,
+            is_generating_device_code=None,
+            gen_program_name=None,
+            schedule_index_end=None):
         self.kernel = kernel
+        self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
         self.seen_dtypes = seen_dtypes
@@ -228,11 +212,21 @@ class CodeGenerationState(object):
         self.allow_complex = allow_complex
         self.vectorization_info = vectorization_info
         self.var_name_generator = var_name_generator
+        self.is_generating_device_code = is_generating_device_code
+        self.gen_program_name = gen_program_name
+        self.schedule_index_end = schedule_index_end
 
     # {{{ copy helpers
 
-    def copy(self, implemented_domain=None, implemented_predicates=frozenset(),
-            var_subst_map=None, vectorization_info=None):
+    def copy(self, kernel=None,
+            implemented_domain=None, implemented_predicates=frozenset(),
+            var_subst_map=None, vectorization_info=None,
+            is_generating_device_code=None,
+            gen_program_name=None,
+            schedule_index_end=None):
+
+        if kernel is None:
+            kernel = self.kernel
 
         if vectorization_info is False:
             vectorization_info = None
@@ -240,8 +234,18 @@ class CodeGenerationState(object):
         elif vectorization_info is None:
             vectorization_info = self.vectorization_info
 
+        if is_generating_device_code is None:
+            is_generating_device_code = self.is_generating_device_code
+
+        if gen_program_name is None:
+            gen_program_name = self.gen_program_name
+
+        if schedule_index_end is None:
+            schedule_index_end = self.schedule_index_end
+
         return CodeGenerationState(
-                kernel=self.kernel,
+                kernel=kernel,
+                implemented_data_info=self.implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
                     implemented_predicates or self.implemented_predicates),
@@ -251,7 +255,10 @@ class CodeGenerationState(object):
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
                 vectorization_info=vectorization_info,
-                var_name_generator=self.var_name_generator)
+                var_name_generator=self.var_name_generator,
+                is_generating_device_code=is_generating_device_code,
+                gen_program_name=gen_program_name,
+                schedule_index_end=schedule_index_end)
 
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
@@ -270,10 +277,7 @@ class CodeGenerationState(object):
 
     @property
     def expression_to_code_mapper(self):
-        # It's kind of unfortunate that this is here, but it's an accident
-        # of history for now.
-
-        return self.kernel.target.get_expression_to_code_mapper(self)
+        return self.ast_builder.get_expression_to_code_mapper(self)
 
     def intersect(self, other):
         new_impl, new_other = isl.align_two(self.implemented_domain, other)
@@ -334,118 +338,17 @@ class CodeGenerationState(object):
         for i in range(vinf.length):
             idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
             new_codegen_state = novec_self.fix(vinf.iname, idx_aff)
-            result.append(func(new_codegen_state))
-
-        return gen_code_block(result)
-# }}}
-
-
-# {{{ cgen overrides
-
-from cgen import Declarator
-
-
-class POD(Declarator):
-    """A simple declarator: The type is given as a :class:`numpy.dtype`
-    and the *name* is given as a string.
-    """
-
-    def __init__(self, target, dtype, name):
-        from loopy.types import LoopyType
-        assert isinstance(dtype, LoopyType)
-
-        self.target = target
-        self.ctype = target.dtype_to_typename(dtype)
-        self.dtype = dtype
-        self.name = name
-
-    def get_decl_pair(self):
-        return [self.ctype], self.name
-
-    def struct_maker_code(self, name):
-        return name
-
-    def struct_format(self):
-        return self.dtype.char
-
-    def alignment_requirement(self):
-        return self.target.alignment_requirement(self)
-
-    def default_value(self):
-        return 0
-
-# }}}
+            result.extend(func(new_codegen_state))
 
+        from loopy.codegen.result import merge_codegen_results
+        return merge_codegen_results(self, result)
 
-# {{{ implemented data info
-
-class ImplementedDataInfo(Record):
-    """
-    .. attribute:: name
-
-        The expanded name of the array. Note that, for example
-        in the case of separate-array-tagged axes, multiple
-        implemented arrays may correspond to one user-facing
-        array.
-
-    .. attribute:: dtype
-    .. attribute:: cgen_declarator
-
-        Declarator syntax tree as a :mod:`cgen` object.
-
-    .. attribute:: arg_class
-
-    .. attribute:: base_name
-
-        The user-facing name of the underlying array.
-        May be *None* for non-array arguments.
-
-    .. attribute:: shape
-    .. attribute:: strides
-
-        Strides in multiples of ``dtype.itemsize``.
-
-    .. attribute:: unvec_shape
-    .. attribute:: unvec_strides
-
-        Strides in multiples of ``dtype.itemsize`` that accounts for
-        :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar
-        manner
-
-
-    .. attribute:: offset_for_name
-    .. attribute:: stride_for_name_and_axis
-
-        A tuple *(name, axis)* indicating the (implementation-facing)
-        name of the array and axis number for which this argument provides
-        the strides.
-
-    .. attribute:: allows_offset
-    """
-
-    def __init__(self, target, name, dtype, cgen_declarator, arg_class,
-            base_name=None,
-            shape=None, strides=None,
-            unvec_shape=None, unvec_strides=None,
-            offset_for_name=None, stride_for_name_and_axis=None,
-            allows_offset=None):
-
-        from loopy.types import LoopyType
-        assert isinstance(dtype, LoopyType)
-
-        Record.__init__(self,
-                name=name,
-                dtype=dtype,
-                cgen_declarator=cgen_declarator,
-                arg_class=arg_class,
-                base_name=base_name,
-                shape=shape,
-                strides=strides,
-                unvec_shape=unvec_shape,
-                unvec_strides=unvec_strides,
-                offset_for_name=offset_for_name,
-                stride_for_name_and_axis=stride_for_name_and_axis,
-                allows_offset=allows_offset)
+    @property
+    def ast_builder(self):
+        if self.is_generating_device_code:
+            return self.kernel.target.get_device_ast_builder()
+        else:
+            return self.kernel.target.get_host_ast_builder()
 
 # }}}
 
@@ -456,11 +359,10 @@ code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
 
 # {{{ main code generation entrypoint
 
-def generate_code(kernel, device=None):
-    if device is not None:
-        from warnings import warn
-        warn("passing 'device' to generate_code() is deprecated",
-                DeprecationWarning, stacklevel=2)
+def generate_code_v2(kernel):
+    """
+    :returns: a :class:`CodeGenerationResult`
+    """
 
     from loopy.kernel import kernel_state
     if kernel.state == kernel_state.INITIAL:
@@ -503,23 +405,24 @@ def generate_code(kernel, device=None):
     from loopy.kernel.data import ValueArg
     from loopy.kernel.array import ArrayBase
 
-    impl_arg_info = []
+    implemented_data_info = []
 
     for arg in kernel.args:
+        is_written = arg.name in kernel.get_written_variables()
         if isinstance(arg, ArrayBase):
-            impl_arg_info.extend(
+            implemented_data_info.extend(
                     arg.decl_info(
                         kernel.target,
-                        is_written=arg.name in kernel.get_written_variables(),
+                        is_written=is_written,
                         index_dtype=kernel.index_dtype))
 
         elif isinstance(arg, ValueArg):
-            impl_arg_info.append(ImplementedDataInfo(
+            implemented_data_info.append(ImplementedDataInfo(
                 target=kernel.target,
                 name=arg.name,
                 dtype=arg.dtype,
-                cgen_declarator=arg.get_arg_decl(kernel.target),
-                arg_class=ValueArg))
+                arg_class=ValueArg,
+                is_written=is_written))
 
         else:
             raise ValueError("argument type not understood: '%s'" % type(arg))
@@ -538,6 +441,7 @@ def generate_code(kernel, device=None):
     initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
     codegen_state = CodeGenerationState(
             kernel=kernel,
+            implemented_data_info=implemented_data_info,
             implemented_domain=initial_implemented_domain,
             implemented_predicates=frozenset(),
             seen_dtypes=seen_dtypes,
@@ -545,14 +449,21 @@ def generate_code(kernel, device=None):
             seen_atomic_dtypes=seen_atomic_dtypes,
             var_subst_map={},
             allow_complex=allow_complex,
-            var_name_generator=kernel.get_var_name_generator())
+            var_name_generator=kernel.get_var_name_generator(),
+            is_generating_device_code=False,
+            gen_program_name=kernel.name,
+            schedule_index_end=len(kernel.schedule))
+
+    from loopy.codegen.result import generate_host_or_device_program
+    codegen_result = generate_host_or_device_program(
+            codegen_state,
+            schedule_index=0)
 
-    code_str, implemented_domains = kernel.target.generate_code(
-            kernel, codegen_state, impl_arg_info)
+    device_code_str = codegen_result.device_code()
 
     from loopy.check import check_implemented_domains
-    assert check_implemented_domains(kernel, implemented_domains,
-            code_str)
+    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
+            device_code_str)
 
     # {{{ handle preambles
 
@@ -576,7 +487,7 @@ def generate_code(kernel, device=None):
             seen_atomic_dtypes=seen_atomic_dtypes)
 
     preamble_generators = (kernel.preamble_generators
-            + kernel.target.preamble_generators())
+            + kernel.target.get_device_ast_builder().preamble_generators())
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
@@ -595,71 +506,48 @@ def generate_code(kernel, device=None):
             remove_common_indentation(lines) + "\n"
             for lines in dedup_preambles]
 
-    code_str = "".join(preamble_codes) + code_str
+    codegen_result = codegen_result.copy(
+            device_preambles=preamble_codes)
 
     # }}}
 
     logger.info("%s: generate code: done" % kernel.name)
 
-    result = code_str, impl_arg_info
-
     if CACHING_ENABLED:
-        code_gen_cache[input_kernel] = result
+        code_gen_cache[input_kernel] = codegen_result
 
-    return result
+    return codegen_result
 
-# }}}
-
-
-# {{{ generate function body
 
-def generate_body(kernel):
-    if kernel.schedule is None:
-        from loopy.schedule import get_one_scheduled_kernel
-        kernel = get_one_scheduled_kernel(kernel)
-    from loopy.kernel import kernel_state
-    if kernel.state != kernel_state.SCHEDULED:
-        raise LoopyError("cannot generate code for a kernel that has not been "
-                "scheduled")
+def generate_code(kernel, device=None):
+    if device is not None:
+        from warnings import warn
+        warn("passing 'device' to generate_code() is deprecated",
+                DeprecationWarning, stacklevel=2)
 
-    from loopy.preprocess import infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
+    codegen_result = generate_code_v2(kernel)
 
-    from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel)
+    if len(codegen_result.device_programs) > 1:
+        raise LoopyError("kernel passed to generate_code yielded multiple "
+                "device programs. Use generate_code_v2.")
 
-    logger.info("%s: generate code: start" % kernel.name)
+    return codegen_result.device_code(), codegen_result.implemented_data_info
 
-    allow_complex = False
-    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
-        if var.dtype.involves_complex():
-            allow_complex = True
+# }}}
 
-    seen_dtypes = set()
-    seen_functions = set()
-    seen_atomic_dtypes = set()
 
-    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
-    codegen_state = CodeGenerationState(
-            kernel=kernel,
-            implemented_domain=initial_implemented_domain,
-            implemented_predicates=frozenset(),
-            seen_dtypes=seen_dtypes,
-            seen_functions=seen_functions,
-            seen_atomic_dtypes=seen_atomic_dtypes,
-            var_subst_map={},
-            allow_complex=allow_complex)
+# {{{ generate function body
 
-    code_str, implemented_domains = kernel.target.generate_body(
-            kernel, codegen_state)
+def generate_body(kernel):
+    codegen_result = generate_code_v2(kernel)
 
-    from loopy.check import check_implemented_domains
-    assert check_implemented_domains(kernel, implemented_domains,
-            code_str)
+    if len(codegen_result.device_programs) != 1:
+        raise LoopyError("generate_body cannot be used on programs "
+                "that yield more than one device program")
 
-    logger.info("%s: generate code: done" % kernel.name)
+    dev_prg, = codegen_result.device_programs
 
-    return code_str
+    return str(dev_prg.body_ast)
 
 # }}}
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 19ac4106ba58821e5d4bf5231eb530977739c7f3..fb254bd54480f716de54de96f6aab9a4bb427767 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -25,17 +25,6 @@ THE SOFTWARE.
 
 import islpy as isl
 from islpy import dim_type
-from pymbolic.mapper.stringifier import PREC_NONE
-
-
-def constraint_to_code(ecm, cns):
-    if cns.is_equality():
-        comp_op = "=="
-    else:
-        comp_op = ">="
-
-    from loopy.symbolic import constraint_to_expr
-    return "%s %s 0" % (ecm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op)
 
 
 # {{{ bounds check generator
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 0b509fad8ec2d3ae6a21d5a228e0fd578cb0ab4c..7b73093873e7a5364945f8888723be639f37dcf4 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -27,17 +27,20 @@ THE SOFTWARE.
 """
 
 
-from loopy.codegen import gen_code_block
+from loopy.codegen.result import merge_codegen_results, wrap_in_if
 import islpy as isl
-from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
-        gather_schedule_subloop, generate_sub_sched_items)
+from loopy.schedule import (
+        EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel,
+        gather_schedule_block, generate_sub_sched_items)
 
 
-def get_admissible_conditional_inames_for(kernel, sched_index):
+def get_admissible_conditional_inames_for(codegen_state, sched_index):
     """This function disallows conditionals on local-idx tagged
     inames if there is a barrier nested somewhere within.
     """
 
+    kernel = codegen_state.kernel
+
     from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
 
     from loopy.schedule import find_active_inames_at, has_barrier_within
@@ -46,17 +49,48 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
     has_barrier = has_barrier_within(kernel, sched_index)
 
     for iname, tag in six.iteritems(kernel.iname_to_tag):
-        if isinstance(tag, HardwareParallelTag):
+        if (isinstance(tag, HardwareParallelTag)
+                and codegen_state.is_generating_device_code):
             if not has_barrier or not isinstance(tag, LocalIndexTag):
                 result.add(iname)
 
     return frozenset(result)
 
 
-def generate_code_for_sched_index(kernel, sched_index, codegen_state):
+def generate_code_for_sched_index(codegen_state, sched_index):
+    kernel = codegen_state.kernel
     sched_item = kernel.schedule[sched_index]
 
-    if isinstance(sched_item, EnterLoop):
+    if isinstance(sched_item, CallKernel):
+        assert not codegen_state.is_generating_device_code
+
+        from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at)
+        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
+        assert past_end_i <= codegen_state.schedule_index_end
+
+        new_codegen_state = codegen_state.copy(
+                is_generating_device_code=True,
+                gen_program_name=sched_item.kernel_name,
+                schedule_index_end=past_end_i-1)
+
+        from loopy.codegen.result import generate_host_or_device_program
+        codegen_result = generate_host_or_device_program(
+                new_codegen_state, sched_index + 1)
+
+        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
+                get_insn_ids_for_block_at(kernel.schedule, sched_index))
+
+        return merge_codegen_results(codegen_state, [
+            codegen_result,
+
+            codegen_state.ast_builder.get_kernel_call(
+                codegen_state,
+                sched_item.kernel_name,
+                glob_grid, loc_grid,
+                ()),
+            ])
+
+    elif isinstance(sched_item, EnterLoop):
         tag = kernel.iname_to_tag.get(sched_item.iname)
 
         from loopy.codegen.loop import (
@@ -76,10 +110,11 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
             raise RuntimeError("encountered (invalid) EnterLoop "
                     "for '%s', tagged '%s'" % (sched_item.iname, tag))
 
-        return func(kernel, sched_index, codegen_state)
+        return func(codegen_state, sched_index)
 
     elif isinstance(sched_item, Barrier):
-        return kernel.target.emit_barrier(sched_item.kind, sched_item.comment)
+        return codegen_state.ast_builder.emit_barrier(
+                sched_item.kind, sched_item.comment)
 
     elif isinstance(sched_item, RunInstruction):
         insn = kernel.id_to_insn[sched_item.insn_id]
@@ -87,7 +122,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
         from loopy.codegen.instruction import generate_instruction_code
         return codegen_state.try_vectorized(
                 "instruction %s" % insn.id,
-                lambda inner_cgs: generate_instruction_code(kernel, insn, inner_cgs))
+                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))
 
     else:
         raise RuntimeError("unexpected schedule item type: %s"
@@ -132,18 +167,20 @@ def group_by(l, key, merge):
     return result
 
 
-def build_loop_nest(kernel, sched_index, codegen_state):
+def build_loop_nest(codegen_state, schedule_index):
     # Most of the complexity of this function goes towards finding groups of
     # instructions that can be nested inside a shared conditional.
 
+    kernel = codegen_state.kernel
+
     # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
 
     # i.e. go up to the next LeaveLoop, and skip over inner loops.
 
     my_sched_indices = []
 
-    i = sched_index
-    while i < len(kernel.schedule):
+    i = schedule_index
+    while i < codegen_state.schedule_index_end:
         sched_item = kernel.schedule[i]
 
         if isinstance(sched_item, LeaveLoop):
@@ -151,9 +188,11 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
         my_sched_indices.append(i)
 
-        if isinstance(sched_item, EnterLoop):
-            _, i = gather_schedule_subloop(
-                    kernel.schedule, i)
+        if isinstance(sched_item, (EnterLoop, CallKernel)):
+            _, i = gather_schedule_block(kernel.schedule, i)
+            assert i <= codegen_state.schedule_index_end, \
+                    "schedule block extends beyond schedule_index_end"
+
         elif isinstance(sched_item, Barrier):
             i += 1
 
@@ -184,7 +223,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             ScheduleIndexInfo(
                 schedule_indices=[i],
                 admissible_cond_inames=(
-                    get_admissible_conditional_inames_for(kernel, i)),
+                    get_admissible_conditional_inames_for(codegen_state, i)),
                 required_predicates=get_required_predicates(kernel, i),
                 used_inames_within=find_used_inames_within(kernel, i)
                 )
@@ -353,7 +392,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                     result = []
                     for i in origin_si_entry.schedule_indices:
                         inner = generate_code_for_sched_index(
-                            kernel, i, inner_codegen_state)
+                            inner_codegen_state, i)
 
                         if inner is not None:
                             result.append(inner)
@@ -372,22 +411,23 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             # gen_code returns a list
 
             if bounds_checks or pred_checks:
-                from loopy.codegen import wrap_in_if
-                from loopy.codegen.bounds import constraint_to_code
+                from loopy.symbolic import constraint_to_expr
 
                 prev_gen_code = gen_code
 
                 def gen_code(inner_codegen_state):
-                    conditionals = [
-                            constraint_to_code(
-                                inner_codegen_state.expression_to_code_mapper, cns)
-                            for cns in bounds_checks] + list(pred_checks)
+                    from pymbolic.primitives import Variable
+                    condition_exprs = [
+                            constraint_to_expr(cns)
+                            for cns in bounds_checks] + [
+                                Variable(pred_chk) for pred_chk in pred_checks]
 
                     prev_result = prev_gen_code(inner_codegen_state)
 
                     return [wrap_in_if(
-                             conditionals,
-                             gen_code_block(prev_result))]
+                        inner_codegen_state,
+                        condition_exprs,
+                        merge_codegen_results(codegen_state, prev_result))]
 
                 cannot_vectorize = False
                 if new_codegen_state.vectorization_info is not None:
@@ -404,7 +444,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                         # gen_code returns a list, but this needs to return a
                         # GeneratedCode instance.
 
-                        return gen_code_block(gen_code(inner_codegen_state))
+                        return gen_code(inner_codegen_state)
 
                     result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                 else:
@@ -418,8 +458,10 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     # }}}
 
-    return gen_code_block(
-            build_insn_group(sched_index_info_entries, codegen_state))
+    insn_group = build_insn_group(sched_index_info_entries, codegen_state)
+    return merge_codegen_results(
+            codegen_state,
+            insn_group)
 
 
 # vim: foldmethod=marker
diff --git a/loopy/codegen/device_mapping.py b/loopy/codegen/device_mapping.py
index d40b0c4d255eda0159c77fbf56889b0a55e5f347..af6f9712a73da22cdc461a76444adaf0dd4b9e6e 100644
--- a/loopy/codegen/device_mapping.py
+++ b/loopy/codegen/device_mapping.py
@@ -1,25 +1,43 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+(unclear)
+"""
+
+# TODO: Matt, please replace the license header
+# TODO: Should move to loopy.schedule.device_mapping
+
 from pytools import Record
+from loopy.diagnostic import LoopyError
 
 
 class HostForLoop(Record):
+    # TOOD: Should have docstring indicating what attributes can occur
     pass
 
 
 class HostConditional(Record):
+    # TOOD: Should have docstring indicating what attributes can occur
     pass
 
 
 class HostBlock(Record):
+    # TOOD: Should have docstring indicating what attributes can occur
     pass
 
 
 class HostInvokeKernel(Record):
+    # TOOD: Should have docstring indicating what attributes can occur
     pass
 
 
 def map_schedule_onto_host_or_device(kernel):
-    from pytools import UniqueNameGenerator
-    kernel_name_gen = UniqueNameGenerator(forced_prefix=kernel.name)
+    from functools import partial
+    kernel_name_gen = partial(
+            kernel.get_var_name_generator(),
+            kernel.name + kernel.target.device_program_name_suffix)
 
     from loopy.schedule import (
         RunInstruction, EnterLoop, LeaveLoop, Barrier,
@@ -67,7 +85,6 @@ def map_schedule_onto_host_or_device(kernel):
                 if loop_required_splitting:
                     schedule_required_splitting = True
                     if current_chunk:
-                        # TODO: Do a better job of naming the kernel...
                         new_kernel_name = kernel_name_gen()
                         new_schedule.extend(
                             # TODO: Infer kernel arguments
@@ -92,7 +109,6 @@ def map_schedule_onto_host_or_device(kernel):
                     # Wrap the current chunk into a kernel call.
                     schedule_required_splitting = True
                     if current_chunk:
-                        # TODO: Do a better job of naming the kernel
                         new_kernel_name = kernel_name_gen()
                         new_schedule.extend(
                             # TODO: Infer kernel arguments
@@ -106,8 +122,8 @@ def map_schedule_onto_host_or_device(kernel):
                     current_chunk.append(sched_item)
                 i += 1
             else:
-                # TODO: Make error message more informative.
-                raise ValueError()
+                raise LoopyError("unexepcted type of schedule item: %s"
+                        % type(sched_item).__name__)
 
         if current_chunk and schedule_required_splitting:
             # Wrap remainder of schedule into a kernel call.
@@ -131,7 +147,7 @@ def map_schedule_onto_host_or_device(kernel):
     if not split_kernel:
         # Wrap everything into a kernel call.
         new_schedule = (
-            [CallKernel(kernel_name=kernel.name)] +
+            [CallKernel(kernel_name=kernel_name_gen())] +
             new_schedule +
             [ReturnFromKernel(kernel_name=kernel.name)])
     new_kernel = kernel.copy(schedule=new_schedule)
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 7b95f59482c826473e069d9f07a80dd97d7ea0ce..db3d15184a94aa13e6a4b449a2036869e34ca566 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -27,12 +27,16 @@ THE SOFTWARE.
 
 from six.moves import range
 import islpy as isl
-from loopy.codegen import GeneratedInstruction, Unvectorizable
+from loopy.codegen import Unvectorizable
+from loopy.codegen.result import CodeGenerationResult
 from pymbolic.mapper.stringifier import PREC_NONE
 
 
-def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, stmt):
-    from loopy.codegen.bounds import get_bounds_checks, constraint_to_code
+def to_codegen_result(
+        codegen_state, insn_id, domain, check_inames, required_preds, ast):
+    from loopy.codegen.bounds import get_bounds_checks
+    from loopy.symbolic import constraint_to_expr
+
     bounds_checks = get_bounds_checks(
             domain, check_inames,
             codegen_state.implemented_domain, overapproximate=False)
@@ -43,53 +47,54 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st
     new_implemented_domain = new_implemented_domain & bounds_check_set
 
     if bounds_check_set.is_empty():
-        return None, None
+        return None
 
-    condition_codelets = [
-            constraint_to_code(
-                codegen_state.expression_to_code_mapper, cns)
+    condition_exprs = [
+            constraint_to_expr(cns)
             for cns in bounds_checks]
 
-    condition_codelets.extend(
+    condition_exprs.extend(
             required_preds - codegen_state.implemented_predicates)
 
-    if condition_codelets:
-        from cgen import If
-        stmt = If("\n&& ".join(condition_codelets), stmt)
+    if condition_exprs:
+        from pymbolic.primitives import LogicalAnd
+        from pymbolic.mapper.stringifier import PREC_NONE
+        ast = codegen_state.ast_builder.emit_if(
+                codegen_state.expression_to_code_mapper(
+                    LogicalAnd(tuple(condition_exprs)), PREC_NONE),
+                ast)
 
-    return stmt, new_implemented_domain
+    return CodeGenerationResult.new(
+            codegen_state, insn_id, ast, new_implemented_domain)
 
 
-def generate_instruction_code(kernel, insn, codegen_state):
+def generate_instruction_code(codegen_state, insn):
+    kernel = codegen_state.kernel
+
     from loopy.kernel.data import Assignment, CallInstruction, CInstruction
 
     if isinstance(insn, Assignment):
-        result = generate_expr_instruction_code(kernel, insn, codegen_state)
+        ast = generate_assignment_instruction_code(codegen_state, insn)
     elif isinstance(insn, CallInstruction):
-        result = generate_call_code(kernel, insn, codegen_state)
+        ast = generate_call_code(codegen_state, insn)
     elif isinstance(insn, CInstruction):
-        result = generate_c_instruction_code(kernel, insn, codegen_state)
+        ast = generate_c_instruction_code(codegen_state, insn)
     else:
         raise RuntimeError("unexpected instruction type")
 
     insn_inames = kernel.insn_inames(insn)
 
-    insn_code, impl_domain = wrap_in_conditionals(
+    return to_codegen_result(
             codegen_state,
+            insn.id,
             kernel.get_inames_domain(insn_inames), insn_inames,
             insn.predicates,
-            result)
+            ast)
 
-    if insn_code is None:
-        return None
 
-    return GeneratedInstruction(
-        insn_id=insn.id,
-        implemented_domain=impl_domain,
-        ast=insn_code)
+def generate_assignment_instruction_code(codegen_state, insn):
+    kernel = codegen_state.kernel
 
-
-def generate_expr_instruction_code(kernel, insn, codegen_state):
     ecm = codegen_state.expression_to_code_mapper
 
     from loopy.expression import dtype_to_type_context, VectorizabilityChecker
@@ -137,8 +142,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
     rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
     if lhs_atomicity is None:
-        from cgen import Assign
-        result = Assign(
+        result = codegen_state.ast_builder.emit_assignment(
+                codegen_state,
                 lhs_code,
                 ecm(insn.expression, prec=PREC_NONE,
                     type_context=rhs_type_context,
@@ -149,7 +154,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
 
     elif isinstance(lhs_atomicity, AtomicUpdate):
         codegen_state.seen_atomic_dtypes.add(lhs_dtype)
-        result = kernel.target.generate_atomic_update(
+        result = codegen_state.ast_builder.generate_atomic_update(
                 kernel, codegen_state, lhs_atomicity, lhs_var,
                 insn.assignee, insn.expression,
                 lhs_dtype, rhs_type_context)
@@ -166,7 +171,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
 
         from cgen import Statement as S  # noqa
 
-        gs, ls = kernel.get_grid_sizes()
+        gs, ls = kernel.get_grid_size_upper_bounds()
 
         printf_format = "%s.%s[%s][%s]: %s" % (
                 kernel.name,
@@ -220,7 +225,9 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     return result
 
 
-def generate_call_code(kernel, insn, codegen_state):
+def generate_call_code(codegen_state, insn):
+    kernel = codegen_state.kernel
+
     # {{{ vectorization handling
 
     if codegen_state.vectorization_info:
@@ -229,7 +236,7 @@ def generate_call_code(kernel, insn, codegen_state):
 
     # }}}
 
-    result = kernel.target.generate_multiple_assignment(
+    result = codegen_state.ast_builder.emit_multiple_assignment(
             codegen_state, insn)
 
     # {{{ tracing
@@ -242,13 +249,15 @@ def generate_call_code(kernel, insn, codegen_state):
     return result
 
 
-def generate_c_instruction_code(kernel, insn, codegen_state):
+def generate_c_instruction_code(codegen_state, insn):
+    kernel = codegen_state.kernel
+
     if codegen_state.vectorization_info is not None:
         raise Unvectorizable("C instructions cannot be vectorized")
 
     body = []
 
-    from loopy.codegen import POD
+    from loopy.target.c import POD
     from cgen import Initializer, Block, Line
 
     from pymbolic.primitives import Variable
@@ -260,7 +269,7 @@ def generate_c_instruction_code(kernel, insn, codegen_state):
 
         body.append(
                 Initializer(
-                    POD(kernel.target, kernel.index_dtype, name),
+                    POD(codegen_state.ast_builder, kernel.index_dtype, name),
                     codegen_state.expression_to_code_mapper(
                         iname_expr, prec=PREC_NONE, type_context="i")))
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 70530d3a90cbdd5a48395034dd05000f6781b473..be7840f27e92dc0339cd678e9dca40421462ac04 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -25,7 +25,7 @@ THE SOFTWARE.
 from six.moves import range
 
 from loopy.diagnostic import warn, LoopyError
-from loopy.codegen import gen_code_block
+from loopy.codegen.result import merge_codegen_results
 import islpy as isl
 from islpy import dim_type
 from loopy.codegen.control import build_loop_nest
@@ -34,7 +34,7 @@ from pymbolic.mapper.stringifier import PREC_NONE
 
 # {{{ conditional-reducing slab decomposition
 
-def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
+def get_slab_decomposition(kernel, iname):
     iname_domain = kernel.get_inames_domain(iname)
 
     if iname_domain.is_empty():
@@ -116,7 +116,9 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 
 # {{{ unrolled loops
 
-def generate_unroll_loop(kernel, sched_index, codegen_state):
+def generate_unroll_loop(codegen_state, sched_index):
+    kernel = codegen_state.kernel
+
     iname = kernel.schedule[sched_index].iname
 
     bounds = kernel.get_iname_bounds(iname, constants_only=True)
@@ -147,16 +149,18 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
         idx_aff = lower_bound_aff + i
         new_codegen_state = codegen_state.fix(iname, idx_aff)
         result.append(
-                build_loop_nest(kernel, sched_index+1, new_codegen_state))
+                build_loop_nest(new_codegen_state, sched_index+1))
 
-    return gen_code_block(result)
+    return merge_codegen_results(codegen_state, result)
 
 # }}}
 
 
 # {{{ vectorized loops
 
-def generate_vectorize_loop(kernel, sched_index, codegen_state):
+def generate_vectorize_loop(codegen_state, sched_index):
+    kernel = codegen_state.kernel
+
     iname = kernel.schedule[sched_index].iname
 
     bounds = kernel.get_iname_bounds(iname, constants_only=True)
@@ -206,7 +210,7 @@ def generate_vectorize_loop(kernel, sched_index, codegen_state):
                 length=length,
                 space=length_aff.space))
 
-    return build_loop_nest(kernel, sched_index+1, new_codegen_state)
+    return build_loop_nest(new_codegen_state, sched_index+1)
 
 # }}}
 
@@ -222,8 +226,10 @@ def intersect_kernel_with_slab(kernel, slab, iname):
 
 # {{{ hw-parallel loop
 
-def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
+def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
         hw_inames_left=None):
+    kernel = codegen_state.kernel
+
     from loopy.kernel.data import (
             UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)
 
@@ -233,9 +239,11 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
                 if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
 
     if not hw_inames_left:
-        return build_loop_nest(kernel, sched_index, codegen_state)
+        return next_func(codegen_state)
 
-    global_size, local_size = kernel.get_grid_sizes()
+    from loopy.schedule import get_insn_ids_for_block_at
+    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
+            get_insn_ids_for_block_at(kernel.schedule, schedule_index))
 
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
@@ -293,8 +301,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
 
     # }}}
 
-    slabs = get_slab_decomposition(
-            kernel, iname, sched_index, codegen_state)
+    slabs = get_slab_decomposition(kernel, iname)
 
     if other_inames_with_same_tag and len(slabs) > 1:
         raise RuntimeError("cannot do slab decomposition on inames that share "
@@ -302,38 +309,39 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
 
     result = []
 
-    from loopy.codegen import add_comment
-
     for slab_name, slab in slabs:
-        cmt = "%s slab for '%s'" % (slab_name, iname)
         if len(slabs) == 1:
-            cmt = None
+            result.append(
+                    codegen_state.ast_builder.emit_comment(
+                        "%s slab for '%s'" % (slab_name, iname)))
 
         # Have the conditional infrastructure generate the
         # slabbing conditionals.
         slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
-        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
+        new_codegen_state = (codegen_state
+                .copy_and_assign(iname, hw_axis_expr)
+                .copy(kernel=slabbed_kernel))
 
         inner = set_up_hw_parallel_loops(
-                slabbed_kernel, sched_index,
-                new_codegen_state, hw_inames_left)
+                new_codegen_state, schedule_index, next_func,
+                hw_inames_left)
 
-        result.append(add_comment(cmt, inner))
+        result.append(inner)
 
-    from loopy.codegen import gen_code_block
-    return gen_code_block(result)
+    return merge_codegen_results(codegen_state, result)
 
 # }}}
 
 
 # {{{ sequential loop
 
-def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
+def generate_sequential_loop_dim_code(codegen_state, sched_index):
+    kernel = codegen_state.kernel
+
     ecm = codegen_state.expression_to_code_mapper
     loop_iname = kernel.schedule[sched_index].iname
 
-    slabs = get_slab_decomposition(
-            kernel, loop_iname, sched_index, codegen_state)
+    slabs = get_slab_decomposition(kernel, loop_iname)
 
     from loopy.codegen.bounds import get_usable_inames_for_conditional
 
@@ -411,40 +419,45 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
                     dim_type.set, impl_slab.dim(dim_type.set),
                     dt, idx, 1)
 
-        new_codegen_state = codegen_state.intersect(impl_slab)
+        new_codegen_state = (
+                codegen_state
+                .intersect(impl_slab)
+                .copy(kernel=intersect_kernel_with_slab(
+                    kernel, slab, iname)))
 
-        inner = build_loop_nest(
-                intersect_kernel_with_slab(
-                    kernel, slab, iname),
-                sched_index+1, new_codegen_state)
+        inner = build_loop_nest(new_codegen_state, sched_index+1)
 
         # }}}
 
         if cmt is not None:
-            from cgen import Comment
-            result.append(Comment(cmt))
+            result.append(codegen_state.ast_builder.emit_comment(cmt))
 
-        from loopy.codegen import POD
-        from cgen import Initializer, Const, Line
         from loopy.symbolic import aff_to_expr
 
+        astb = codegen_state.ast_builder
+
         if (static_ubound - static_lbound).plain_is_zero():
             # single-trip, generate just a variable assignment, not a loop
-            result.append(gen_code_block([
-                Initializer(
-                    Const(POD(kernel.target, kernel.index_dtype, loop_iname)),
-                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
-                Line(),
+            result.append(merge_codegen_results(codegen_state, [
+                astb.emit_initializer(
+                    codegen_state,
+                    kernel.index_dtype, loop_iname,
+                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
+                    is_const=True),
+                astb.emit_blank_line(),
                 inner,
                 ]))
 
         else:
+            inner_ast = inner.current_ast(codegen_state)
             result.append(
-                kernel.target.emit_sequential_loop(
-                       codegen_state, loop_iname, kernel.index_dtype,
-                       static_lbound, static_ubound, inner))
+                inner.with_new_ast(
+                    codegen_state,
+                    astb.emit_sequential_loop(
+                        codegen_state, loop_iname, kernel.index_dtype,
+                        static_lbound, static_ubound, inner_ast)))
 
-    return gen_code_block(result)
+    return merge_codegen_results(codegen_state, result)
 
 # }}}
 
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2c44fecb70ccb60304a0ef473ac9315c762880
--- /dev/null
+++ b/loopy/codegen/result.py
@@ -0,0 +1,280 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+from pytools import Record
+
+
+# {{{ code generation result
+
+class GeneratedProgram(Record):
+    """
+    .. attribute:: name
+
+    .. attribute:: is_device_program
+
+    .. attribute:: ast
+
+        Once generated, this captures the AST of the overall function
+        definition, including the body.
+
+    .. attribute:: body_ast
+
+        Once generated, this captures the AST of the operative function
+        body (including declaration of necessary temporaries), but not
+        the overall function definition.
+    """
+
+
+class CodeGenerationResult(Record):
+    """
+    .. attribute:: host_program
+    .. attribute:: device_programs
+
+        A list of :class:`GeneratedProgram` instances
+        intended to run on the compute device.
+
+    .. attribute:: implemented_domains
+
+        A mapping from instruction ID to a list of :class:`islpy.Set`
+        objects.
+
+    .. attribute:: host_preambles
+    .. attribute:: device_preambles
+
+    .. automethod:: host_code
+    .. automethod:: device_code
+
+    .. attribute:: implemented_data_info
+
+        a list of :class:`loopy.codegen.ImplementedDataInfo` objects.
+        Only added at the very end of code generation.
+    """
+
+    @staticmethod
+    def new(codegen_state, insn_id, ast, implemented_domain):
+        prg = GeneratedProgram(
+                name=codegen_state.gen_program_name,
+                is_device_program=codegen_state.is_generating_device_code,
+                ast=ast)
+
+        if codegen_state.is_generating_device_code:
+            kwargs = {
+                    "host_program": None,
+                    "device_programs": [prg],
+                    }
+        else:
+            kwargs = {
+                    "host_program": prg,
+                    "device_programs": [],
+                    }
+
+        return CodeGenerationResult(
+                implemented_data_info=codegen_state.implemented_data_info,
+                implemented_domains={insn_id: [implemented_domain]},
+                **kwargs)
+
+    def host_code(self):
+        preamble_codes = getattr(self, "host_preambles", [])
+
+        return (
+                "".join(preamble_codes)
+                +
+                str(self.host_program.ast))
+
+    def device_code(self):
+        preamble_codes = getattr(self, "device_preambles", [])
+
+        return (
+                "".join(preamble_codes)
+                + "\n"
+                + "\n\n".join(str(dp.ast) for dp in self.device_programs))
+
+    def current_program(self, codegen_state):
+        if codegen_state.is_generating_device_code:
+            if self.device_programs:
+                result = self.device_programs[-1]
+            else:
+                result = None
+        else:
+            result = self.host_program
+
+        if result is None:
+            ast = codegen_state.ast_builder.ast_block_class([])
+            result = GeneratedProgram(
+                    name=codegen_state.gen_program_name,
+                    is_device_program=codegen_state.is_generating_device_code,
+                    ast=ast)
+
+        assert result.name == codegen_state.gen_program_name
+        return result
+
+    def with_new_program(self, codegen_state, program):
+        if codegen_state.is_generating_device_code:
+            assert program.name == codegen_state.gen_program_name
+            assert program.is_device_program
+            return self.copy(
+                    device_programs=(
+                        self.device_programs[:-1]
+                        +
+                        [program]))
+        else:
+            assert program.name == codegen_state.gen_program_name
+            assert not program.is_device_program
+            return self.copy(host_program=program)
+
+    def current_ast(self, codegen_state):
+        return self.current_program(codegen_state).ast
+
+    def with_new_ast(self, codegen_state, new_ast):
+        return self.with_new_program(
+                codegen_state,
+                self.current_program(codegen_state).copy(
+                    ast=new_ast))
+
+
+# }}}
+
+
+# {{{ support code for AST merging
+
+def merge_codegen_results(codegen_state, elements, collapse=True):
+    elements = [el for el in elements if el is not None]
+
+    if not elements:
+        return CodeGenerationResult(
+                host_program=None,
+                device_programs=[],
+                implemented_domains={},
+                implemented_data_info=codegen_state.implemented_data_info)
+
+    ast_els = []
+    new_device_programs = []
+    dev_program_names = set()
+    implemented_domains = {}
+    codegen_result = None
+
+    block_cls = codegen_state.ast_builder.ast_block_class
+
+    for el in elements:
+        if isinstance(el, CodeGenerationResult):
+            if codegen_result is None:
+                codegen_result = el
+            else:
+                assert (
+                        el.current_program(codegen_state).name
+                        == codegen_result.current_program(codegen_state).name)
+
+            for insn_id, idoms in six.iteritems(el.implemented_domains):
+                implemented_domains.setdefault(insn_id, []).extend(idoms)
+
+            if not codegen_state.is_generating_device_code:
+                for dp in el.device_programs:
+                    if dp.name not in dev_program_names:
+                        new_device_programs.append(dp)
+                        dev_program_names.add(dp.name)
+
+            cur_ast = el.current_ast(codegen_state)
+            if isinstance(cur_ast, block_cls):
+                ast_els.extend(cur_ast.contents)
+            else:
+                ast_els.append(cur_ast)
+
+        else:
+            ast_els.append(el)
+
+    if collapse and len(ast_els) == 1:
+        ast, = ast_els
+    else:
+        ast = block_cls(ast_els)
+
+    kwargs = {}
+    if not codegen_state.is_generating_device_code:
+        kwargs["device_programs"] = new_device_programs
+
+    return (codegen_result
+            .with_new_ast(codegen_state, ast)
+            .copy(
+                implemented_domains=implemented_domains,
+                **kwargs))
+
+
+def wrap_in_if(codegen_state, condition_exprs, inner):
+    if condition_exprs:
+        from pymbolic.primitives import LogicalAnd
+        from pymbolic.mapper.stringifier import PREC_NONE
+        cur_ast = inner.current_ast(codegen_state)
+        return inner.with_new_ast(
+                codegen_state,
+                codegen_state.ast_builder.emit_if(
+                    codegen_state.expression_to_code_mapper(
+                        LogicalAnd(tuple(condition_exprs)), PREC_NONE),
+                    cur_ast))
+
+    return inner
+
+# }}}
+
+
+# {{{ program generation top-level
+
+def generate_host_or_device_program(codegen_state, schedule_index):
+    ast_builder = codegen_state.ast_builder
+    temp_decls = ast_builder.get_temporary_decls(codegen_state)
+
+    from functools import partial
+
+    from loopy.codegen.control import build_loop_nest
+    next_func = partial(build_loop_nest, schedule_index=schedule_index)
+
+    if codegen_state.is_generating_device_code:
+        from loopy.codegen.loop import set_up_hw_parallel_loops
+        codegen_result = set_up_hw_parallel_loops(
+                codegen_state, schedule_index, next_func=next_func)
+    else:
+        codegen_result = next_func(codegen_state)
+
+    codegen_result = merge_codegen_results(
+            codegen_state,
+            temp_decls + [codegen_result],
+            collapse=False)
+
+    cur_prog = codegen_result.current_program(codegen_state)
+    body_ast = cur_prog.ast
+    fdecl_ast = ast_builder.get_function_declaration(
+            codegen_state, codegen_result, schedule_index)
+
+    fdef_ast = ast_builder.get_function_definition(
+            codegen_state, codegen_result,
+            schedule_index, fdecl_ast, body_ast)
+
+    codegen_result = codegen_result.with_new_program(
+            codegen_state,
+            cur_prog.copy(
+                ast=fdef_ast,
+                body_ast=body_ast))
+
+    return codegen_result
+
+# }}}
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 2125fd66347c99adb4014a60c34a6ce44bc05e95..a30e4b133c60987213d5a62482b7a04a21cc05ad 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 import six
 from six.moves import range, zip
 
-import sys
 import numpy as np
 from pytools import Record, memoize_method
 from loopy.diagnostic import ParameterFinderWarning
@@ -137,7 +136,7 @@ def python_dtype_str(dtype):
 
 # {{{ integer arg finding from shapes
 
-def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options):
+def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info):
     # a mapping from integer argument names to a list of tuples
     # (arg_name, expression), where expression is a
     # unary function of kernel.arg_dict[arg_name]
@@ -149,7 +148,7 @@ def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options
     dep_map = DependencyMapper()
 
     from pymbolic import var
-    for arg in impl_arg_info:
+    for arg in implemented_data_info:
         if arg.arg_class is GlobalArg:
             sym_shape = var(arg.name).attr("shape")
             for axis_nr, shape_i in enumerate(arg.shape):
@@ -208,11 +207,13 @@ def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options
 
 # {{{ integer arg finding from offsets
 
-def generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options):
+def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info):
+    options = kernel.options
+
     gen("# {{{ find integer arguments from offsets")
     gen("")
 
-    for arg in impl_arg_info:
+    for arg in implemented_data_info:
         impl_array_name = arg.offset_for_name
         if impl_array_name is not None:
             gen("if %s is None:" % arg.name)
@@ -255,11 +256,13 @@ def generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, option
 
 # {{{ integer arg finding from strides
 
-def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options):
+def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info):
+    options = kernel.options
+
     gen("# {{{ find integer arguments from strides")
     gen("")
 
-    for arg in impl_arg_info:
+    for arg in implemented_data_info:
         if arg.stride_for_name_and_axis is not None:
             impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
 
@@ -297,156 +300,9 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option
 # }}}
 
 
-# {{{ value arg setup
-
-def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
-    import loopy as lp
-    from loopy.kernel.array import ArrayBase
-
-    # {{{ arg counting bug handling
-
-    # For example:
-    # https://github.com/pocl/pocl/issues/197
-    # (but Apple CPU has a similar bug)
-
-    work_around_arg_count_bug = False
-    warn_about_arg_count_bug = False
-
-    devices = cl_kernel.context.devices
-
-    try:
-        from pyopencl.characterize import has_struct_arg_count_bug
-
-    except ImportError:
-        count_bug_per_dev = [False]*len(devices)
-
-    else:
-        count_bug_per_dev = [
-                has_struct_arg_count_bug(dev)
-                for dev in devices]
-
-    if any(count_bug_per_dev):
-        if all(count_bug_per_dev):
-            work_around_arg_count_bug = True
-        else:
-            warn_about_arg_count_bug = True
-
-    # }}}
-
-    cl_arg_idx = 0
-    arg_idx_to_cl_arg_idx = {}
-
-    fp_arg_count = 0
-
-    for arg_idx, arg in enumerate(impl_arg_info):
-        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
-
-        if arg.arg_class is not lp.ValueArg:
-            assert issubclass(arg.arg_class, ArrayBase)
-
-            # assume each of those generates exactly one...
-            cl_arg_idx += 1
-
-            continue
-
-        gen("# {{{ process %s" % arg.name)
-        gen("")
-
-        if not options.skip_arg_checks:
-            gen("""
-                if {name} is None:
-                    raise RuntimeError("input argument '{name}' must "
-                        "be supplied")
-                """.format(name=arg.name))
-
-        if sys.version_info < (2, 7) and arg.dtype.is_integral():
-            gen("# cast to long to avoid trouble with struct packing")
-            gen("%s = long(%s)" % (arg.name, arg.name))
-            gen("")
-
-        if arg.dtype.is_composite():
-            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
-            cl_arg_idx += 1
-
-        elif arg.dtype.is_complex():
-            assert isinstance(arg.dtype, NumpyType)
-
-            dtype = arg.dtype
-
-            if warn_about_arg_count_bug:
-                from warnings import warn
-                warn("{knl_name}: arguments include complex numbers, and "
-                        "some (but not all) of the target devices mishandle "
-                        "struct kernel arguments (hence the workaround is "
-                        "disabled".format(
-                            knl_name=kernel.name))
-
-            if dtype.numpy_dtype == np.complex64:
-                arg_char = "f"
-            elif dtype.numpy_dtype == np.complex128:
-                arg_char = "d"
-            else:
-                raise TypeError("unexpected complex type: %s" % dtype)
-
-            if (work_around_arg_count_bug
-                    and dtype.numpy_dtype == np.complex128
-                    and fp_arg_count + 2 <= 8):
-                gen(
-                        "buf = _lpy_pack('{arg_char}', {arg_var}.real)"
-                        .format(arg_char=arg_char, arg_var=arg.name))
-                gen(
-                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
-                        .format(cl_arg_idx=cl_arg_idx))
-                cl_arg_idx += 1
-
-                gen(
-                        "buf = _lpy_pack('{arg_char}', {arg_var}.imag)"
-                        .format(arg_char=arg_char, arg_var=arg.name))
-                gen(
-                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
-                        .format(cl_arg_idx=cl_arg_idx))
-                cl_arg_idx += 1
-            else:
-                gen(
-                        "buf = _lpy_pack('{arg_char}{arg_char}', "
-                        "{arg_var}.real, {arg_var}.imag)"
-                        .format(arg_char=arg_char, arg_var=arg.name))
-                gen(
-                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
-                        .format(cl_arg_idx=cl_arg_idx))
-                cl_arg_idx += 1
-
-            fp_arg_count += 2
-
-        elif isinstance(arg.dtype, NumpyType):
-            if arg.dtype.dtype.kind == "f":
-                fp_arg_count += 1
-
-            gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
-                    % (cl_arg_idx, arg.dtype.dtype.char, arg.name))
-
-            cl_arg_idx += 1
-
-        else:
-            raise LoopyError("do not know how to pass argument of type '%s'"
-                    % arg.dtype)
-
-        gen("")
-
-        gen("# }}}")
-        gen("")
-
-    assert cl_arg_idx == cl_kernel.num_args
-
-    return arg_idx_to_cl_arg_idx
-
-# }}}
-
-
-# {{{ array arg setup
+# {{{ arg setup
 
-def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
-        arg_idx_to_cl_arg_idx):
+def generate_arg_setup(gen, kernel, implemented_data_info, options):
     import loopy as lp
 
     from loopy.kernel.array import ArrayBase
@@ -461,13 +317,16 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
         gen("_lpy_encountered_dev = False")
         gen("")
 
+    args = []
+
     strify = StringifyMapper()
 
-    for arg_idx, arg in enumerate(impl_arg_info):
+    for arg_idx, arg in enumerate(implemented_data_info):
         is_written = arg.base_name in kernel.get_written_variables()
         kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
 
         if not issubclass(arg.arg_class, ArrayBase):
+            args.append(arg.name)
             continue
 
         gen("# {{{ process %s" % arg.name)
@@ -667,12 +526,11 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
             gen("del _lpy_made_by_loopy")
             gen("")
 
-        cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx]
-
         if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-            gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name))
+            args.append("%s.base_data" % arg.name)
         else:
-            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
+            args.append("%s" % arg.name)
+
         gen("")
 
         gen("# }}}")
@@ -681,19 +539,23 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
     gen("# }}}")
     gen("")
 
+    return args
+
 # }}}
 
 
-def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
+def generate_invoker(kernel, implemented_data_info, host_code):
+    options = kernel.options
+
     system_args = [
-            "cl_kernel", "queue", "allocator=None", "wait_for=None",
+            "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
             # ignored if options.no_numpy
             "out_host=None"
             ]
 
     gen = PythonFunctionGenerator(
             "invoke_%s_loopy_kernel" % kernel.name,
-            system_args + ["%s=None" % iai.name for iai in impl_arg_info])
+            system_args + ["%s=None" % iai.name for iai in implemented_data_info])
 
     gen.add_to_preamble("from __future__ import division")
     gen.add_to_preamble("")
@@ -701,7 +563,8 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
     gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
     gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
     gen.add_to_preamble("import numpy as _lpy_np")
-    gen.add_to_preamble("from struct import pack as _lpy_pack")
+    gen.add_to_preamble("")
+    gen.add_to_preamble(host_code)
     gen.add_to_preamble("")
 
     gen("if allocator is None:")
@@ -709,37 +572,21 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
         gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
     gen("")
 
-    generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options)
-    generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options)
-    generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options)
+    generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info)
+    generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info)
+    generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info)
 
-    arg_idx_to_cl_arg_idx = \
-            generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options)
-    generate_array_arg_setup(gen, kernel, impl_arg_info, options,
-            arg_idx_to_cl_arg_idx)
+    args = generate_arg_setup(gen, kernel, implemented_data_info, options)
 
     # {{{ generate invocation
 
-    from loopy.symbolic import StringifyMapper
-
-    strify = StringifyMapper()
-    gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs()
-
-    if not gsize_expr:
-        gsize_expr = (1,)
-    if not lsize_expr:
-        lsize_expr = (1,)
-
-    def strify_tuple(t):
-        return "(%s,)" % (
-                ", ".join("int(%s)" % strify(t_i) for t_i in t))
-
-    gen("_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, "
-            "%(gsize)s, %(lsize)s,  wait_for=wait_for, g_times_l=True)"
-            % dict(
-                gsize=strify_tuple(gsize_expr),
-                lsize=strify_tuple(lsize_expr)))
-    gen("")
+    gen("_lpy_evt = {kernel_name}({args})"
+            .format(
+                kernel_name=kernel.name,
+                args=", ".join(
+                    ["_lpy_cl_kernels", "queue"]
+                    + args
+                    + ["wait_for=wait_for"])))
 
     # }}}
 
@@ -754,7 +601,7 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
         gen("if out_host:")
         with Indentation(gen):
             gen("pass")  # if no outputs (?!)
-            for arg_idx, arg in enumerate(impl_arg_info):
+            for arg_idx, arg in enumerate(implemented_data_info):
                 is_written = arg.base_name in kernel.get_written_variables()
                 if is_written:
                     gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
@@ -764,11 +611,11 @@ def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
     if options.return_dict:
         gen("return _lpy_evt, {%s}"
                 % ", ".join("\"%s\": %s" % (arg.name, arg.name)
-                    for arg in impl_arg_info
+                    for arg in implemented_data_info
                     if arg.base_name in kernel.get_written_variables()))
     else:
         out_args = [arg
-                for arg in impl_arg_info
+                for arg in implemented_data_info
                 if arg.base_name in kernel.get_written_variables()]
         if out_args:
             gen("return _lpy_evt, (%s,)"
@@ -801,6 +648,10 @@ class _CLKernelInfo(Record):
     pass
 
 
+class _CLKernels(object):
+    pass
+
+
 class CompiledKernel:
     """An object connecting a kernel to a :class:`pyopencl.Context`
     for execution.
@@ -819,7 +670,9 @@ class CompiledKernel:
         """
 
         self.context = context
-        self.kernel = kernel
+
+        from loopy.target.pyopencl import PyOpenCLTarget
+        self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
 
         self.packing_controller = SeparateArrayPackingController(kernel)
 
@@ -865,11 +718,13 @@ class CompiledKernel:
     def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
 
-        from loopy.codegen import generate_code
-        code, impl_arg_info = generate_code(kernel)
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+        dev_code = codegen_result.device_code()
 
         if self.kernel.options.write_cl:
-            output = code
+            output = dev_code
             if self.kernel.options.highlight_cl:
                 output = get_highlighted_cl_code(output)
 
@@ -881,23 +736,31 @@ class CompiledKernel:
 
         if self.kernel.options.edit_cl:
             from pytools import invoke_editor
-            code = invoke_editor(code, "code.cl")
+            dev_code = invoke_editor(dev_code, "code.cl")
 
         import pyopencl as cl
 
         logger.info("%s: opencl compilation start" % self.kernel.name)
-        cl_program = cl.Program(self.context, code)
-        cl_kernel = getattr(
-                cl_program.build(options=kernel.options.cl_build_options),
-                kernel.name)
+
+        cl_program = (
+                cl.Program(self.context, dev_code)
+                .build(options=kernel.options.cl_build_options))
+
+        cl_kernels = _CLKernels()
+        for dp in codegen_result.device_programs:
+            setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
+
         logger.info("%s: opencl compilation done" % self.kernel.name)
 
         return _CLKernelInfo(
                 kernel=kernel,
-                cl_kernel=cl_kernel,
-                impl_arg_info=impl_arg_info,
+                cl_kernels=cl_kernels,
+                implemented_data_info=codegen_result.implemented_data_info,
                 invoker=generate_invoker(
-                    kernel, cl_kernel, impl_arg_info, self.kernel.options))
+                    kernel,
+                    codegen_result.implemented_data_info,
+                    codegen_result.host_code(),
+                    ))
 
     # {{{ debugging aids
 
@@ -907,9 +770,9 @@ class CompiledKernel:
 
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
 
-        from loopy.codegen import generate_code
-        code, arg_info = generate_code(kernel)
-        return code
+        from loopy.codegen import generate_code_v2
+        code = generate_code_v2(kernel)
+        return code.device_code()
 
     def get_highlighted_code(self, arg_to_dtype=None):
         return get_highlighted_cl_code(
@@ -979,7 +842,7 @@ class CompiledKernel:
                 frozenset(six.iteritems(arg_to_dtype)))
 
         return kernel_info.invoker(
-                kernel_info.cl_kernel, queue, allocator, wait_for,
+                kernel_info.cl_kernels, queue, allocator, wait_for,
                 out_host, **kwargs)
 
 # }}}
diff --git a/loopy/expression.py b/loopy/expression.py
index 6fd49661d617f65d2cdbc5773c3a59955da8c19c..991f4a93e30a76a09b527e4fd326cfafff5e7569 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -231,7 +231,10 @@ class TypeInferenceMapper(CombineMapper):
         if expr.name in self.kernel.all_inames():
             return self.kernel.index_dtype
 
-        result = self.kernel.mangle_symbol(expr.name)
+        result = self.kernel.mangle_symbol(
+                self.kernel.target.get_device_ast_builder(),
+                expr.name)
+
         if result is not None:
             result_dtype, _ = result
             return result_dtype
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 7baea324346b79e40d7ca601a99644727dd66e95..9b2c896ac2e8defc20bb54ea5c0682b68a363263 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -198,7 +198,7 @@ class LoopKernel(RecordWithoutPickling):
             # When kernels get intersected in slab decomposition,
             # their grid sizes shouldn't change. This provides
             # a way to forward sub-kernel grid size requests.
-            get_grid_sizes=None):
+            get_grid_sizes_for_insn_ids=None):
 
         if cache_manager is None:
             from loopy.kernel.tools import SetOperationCacheManager
@@ -264,9 +264,9 @@ class LoopKernel(RecordWithoutPickling):
         if np.iinfo(index_dtype.numpy_dtype).min >= 0:
             raise TypeError("index_dtype must be signed")
 
-        if get_grid_sizes is not None:
+        if get_grid_sizes_for_insn_ids is not None:
             # overwrites method down below
-            self.get_grid_sizes = get_grid_sizes
+            self.get_grid_sizes_for_insn_ids = get_grid_sizes_for_insn_ids
 
         if state not in [
                 kernel_state.INITIAL,
@@ -307,8 +307,11 @@ class LoopKernel(RecordWithoutPickling):
 
     # {{{ function mangling
 
-    def mangle_function(self, identifier, arg_dtypes):
-        manglers = self.target.function_manglers() + self.function_manglers
+    def mangle_function(self, identifier, arg_dtypes, ast_builder=None):
+        if ast_builder is None:
+            ast_builder = self.target.get_device_ast_builder()
+
+        manglers = ast_builder.function_manglers() + self.function_manglers
 
         for mangler in manglers:
             mangle_result = mangler(self, identifier, arg_dtypes)
@@ -349,8 +352,8 @@ class LoopKernel(RecordWithoutPickling):
 
     # {{{ symbol mangling
 
-    def mangle_symbol(self, identifier):
-        manglers = self.target.symbol_manglers() + self.symbol_manglers
+    def mangle_symbol(self, ast_builder, identifier):
+        manglers = ast_builder.symbol_manglers() + self.symbol_manglers
 
         for mangler in manglers:
             result = mangler(self, identifier)
@@ -895,9 +898,20 @@ class LoopKernel(RecordWithoutPickling):
                 constants_only=True)))
 
     @memoize_method
-    def get_grid_sizes(self, ignore_auto=False):
+    def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of all instructions whose IDs are given
+        in *insn_ids*.
+
+        :arg insn_ids: a :class:`frozenset` of instruction IDs
+
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        """
+
         all_inames_by_insns = set()
-        for insn in self.instructions:
+        for insn_id in insn_ids:
+            insn = self.id_to_insn[insn_id]
+
             all_inames_by_insns |= self.insn_inames(insn)
 
         if not all_inames_by_insns <= self.all_inames():
@@ -973,8 +987,18 @@ class LoopKernel(RecordWithoutPickling):
         return (to_dim_tuple(global_sizes, "global"),
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
-    def get_grid_sizes_as_exprs(self, ignore_auto=False):
-        grid_size, group_size = self.get_grid_sizes(ignore_auto)
+    def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of all instructions whose IDs are given
+        in *insn_ids*.
+
+        :arg insn_ids: a :class:`frozenset` of instruction IDs
+
+        *global_size* and *local_size* are :mod:`pymbolic` expressions
+        """
+
+        grid_size, group_size = self.get_grid_sizes_for_insn_ids(
+                insn_ids, ignore_auto)
 
         def tup_to_exprs(tup):
             from loopy.symbolic import pw_aff_to_expr
@@ -982,6 +1006,27 @@ class LoopKernel(RecordWithoutPickling):
 
         return tup_to_exprs(grid_size), tup_to_exprs(group_size)
 
+    def get_grid_size_upper_bounds(self, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of *all* instructions in the kernel.
+
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        """
+        return self.get_grid_sizes_for_insn_ids(
+                frozenset(insn.id for insn in self.instructions),
+                ignore_auto=ignore_auto)
+
+    def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of *all* instructions in the kernel.
+
+        *global_size* and *local_size* are :mod:`pymbolic` expressions
+        """
+
+        return self.get_grid_sizes_for_insn_ids_as_exprs(
+                frozenset(insn.id for insn in self.instructions),
+                ignore_auto=ignore_auto)
+
     # }}}
 
     # {{{ local memory
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index b82965914a2fd6eabb460e3e68f31131862c6330..988636ebffb318a34dbb61dfdfb1f5d4c523bc77 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -991,7 +991,6 @@ class ArrayBase(Record):
                 # generate stride arguments, yielded later to keep array first
                 for stride_user_axis, stride_impl_axis, stride_unvec_impl_axis \
                         in stride_arg_axes:
-                    from cgen import Const, POD
                     stride_name = full_name+"_stride%d" % stride_user_axis
 
                     from pymbolic import var
@@ -1004,20 +1003,16 @@ class ArrayBase(Record):
                                 target=target,
                                 name=stride_name,
                                 dtype=index_dtype,
-                                cgen_declarator=Const(POD(index_dtype, stride_name)),
                                 arg_class=ValueArg,
                                 stride_for_name_and_axis=(
-                                    full_name, stride_impl_axis)))
+                                    full_name, stride_impl_axis),
+                                is_written=False))
 
                 yield ImplementedDataInfo(
                             target=target,
                             name=full_name,
                             base_name=self.name,
 
-                            # implemented by various argument types
-                            cgen_declarator=self.get_arg_decl(
-                                target, name_suffix, shape, dtype, is_written),
-
                             arg_class=type(self),
                             dtype=dtype,
                             shape=shape,
@@ -1025,18 +1020,18 @@ class ArrayBase(Record):
                             unvec_shape=unvec_shape,
                             unvec_strides=tuple(unvec_strides),
                             allows_offset=bool(self.offset),
-                            )
+
+                            is_written=is_written)
 
                 if self.offset:
-                    from cgen import Const, POD
                     offset_name = full_name+"_offset"
                     yield ImplementedDataInfo(
                                 target=target,
                                 name=offset_name,
                                 dtype=index_dtype,
-                                cgen_declarator=Const(POD(index_dtype, offset_name)),
                                 arg_class=ValueArg,
-                                offset_for_name=full_name)
+                                offset_for_name=full_name,
+                                is_written=False)
 
                 for sa in stride_args:
                     yield sa
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index b477b5fee0b047d0fb74099174e2a8c6f08336d1..8de5919df44a6201a4cfb123fa1a34516132634e 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -202,8 +202,8 @@ class GlobalArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
-        return target.get_global_arg_decl(self.name + name_suffix, shape,
+    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
+        return ast_builder.get_global_arg_decl(self.name + name_suffix, shape,
                 dtype, is_written)
 
 
@@ -211,8 +211,8 @@ class ConstantArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
-        return target.get_constant_arg_decl(self.name + name_suffix, shape,
+    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
+        return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape,
                 dtype, is_written)
 
 
@@ -224,8 +224,8 @@ class ImageArg(ArrayBase, KernelArgument):
     def dimensions(self):
         return len(self.dim_tags)
 
-    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
-        return target.get_image_arg_decl(self.name + name_suffix, shape,
+    def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
+        return ast_builder.get_image_arg_decl(self.name + name_suffix, shape,
                 self.num_target_axes(), dtype, is_written)
 
 
@@ -258,8 +258,8 @@ class ValueArg(KernelArgument):
         key_builder.rec(key_hash, self.name)
         key_builder.rec(key_hash, self.dtype)
 
-    def get_arg_decl(self, target):
-        return target.get_value_arg_decl(self.name, (),
+    def get_arg_decl(self, ast_builder):
+        return ast_builder.get_value_arg_decl(self.name, (),
                 self.dtype, False)
 
 # }}}
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 0a02979ab32ad45cc4b1f6519c2bd327fa150812..7862ceb1efbed4f70ebe0fa81d07de84b12fca60 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -393,7 +393,8 @@ class DomainChanger:
 
                 # Changing the domain might look like it wants to change grid
                 # sizes. Not true.
-                get_grid_sizes=self.kernel.get_grid_sizes)
+                # (Relevant for 'slab decomposition')
+                get_grid_sizes_for_insn_ids=self.kernel.get_grid_sizes_for_insn_ids)
 
 # }}}
 
@@ -767,7 +768,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     # copies.
 
     if local_size is None:
-        _, local_size = kernel.get_grid_sizes_as_exprs(
+        _, local_size = kernel.get_grid_size_upper_bounds_as_exprs(
                 ignore_auto=True)
 
     # {{{ axis assignment helper function
diff --git a/loopy/schedule.py b/loopy/schedule/__init__.py
similarity index 97%
rename from loopy/schedule.py
rename to loopy/schedule/__init__.py
index 7ac8778378869f7c9cf6cfbe8d4be145e9fad520..38fc7c5812b9105904a9cb156527bc074fdc6728 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule/__init__.py
@@ -50,11 +50,19 @@ class ScheduleItem(Record):
             key_builder.rec(key_hash, getattr(self, field_name))
 
 
-class EnterLoop(ScheduleItem):
+class BeginBlockItem(ScheduleItem):
+    pass
+
+
+class EndBlockItem(ScheduleItem):
+    pass
+
+
+class EnterLoop(BeginBlockItem):
     hash_fields = __slots__ = ["iname"]
 
 
-class LeaveLoop(ScheduleItem):
+class LeaveLoop(EndBlockItem):
     hash_fields = __slots__ = ["iname"]
 
 
@@ -62,11 +70,11 @@ class RunInstruction(ScheduleItem):
     hash_fields = __slots__ = ["insn_id"]
 
 
-class CallKernel(ScheduleItem):
+class CallKernel(BeginBlockItem):
     hash_fields = __slots__ = ["kernel_name"]
 
 
-class ReturnFromKernel(ScheduleItem):
+class ReturnFromKernel(EndBlockItem):
     hash_fields = __slots__ = ["kernel_name"]
 
 
@@ -87,15 +95,15 @@ class Barrier(ScheduleItem):
 
 # {{{ schedule utilities
 
-def gather_schedule_subloop(schedule, start_idx):
-    assert isinstance(schedule[start_idx], EnterLoop)
+def gather_schedule_block(schedule, start_idx):
+    assert isinstance(schedule[start_idx], BeginBlockItem)
     level = 0
 
     i = start_idx
     while i < len(schedule):
-        if isinstance(schedule[i], EnterLoop):
+        if isinstance(schedule[i], BeginBlockItem):
             level += 1
-        if isinstance(schedule[i], LeaveLoop):
+        elif isinstance(schedule[i], EndBlockItem):
             level -= 1
 
             if level == 0:
@@ -107,16 +115,17 @@ def gather_schedule_subloop(schedule, start_idx):
 
 
 def generate_sub_sched_items(schedule, start_idx):
-    if not isinstance(schedule[start_idx], EnterLoop):
+    if not isinstance(schedule[start_idx], BeginBlockItem):
         yield start_idx, schedule[start_idx]
 
     level = 0
     i = start_idx
     while i < len(schedule):
         sched_item = schedule[i]
-        if isinstance(sched_item, EnterLoop):
+        if isinstance(sched_item, BeginBlockItem):
             level += 1
-        elif isinstance(sched_item, LeaveLoop):
+
+        elif isinstance(sched_item, EndBlockItem):
             level -= 1
 
         else:
@@ -130,6 +139,14 @@ def generate_sub_sched_items(schedule, start_idx):
     assert False
 
 
+def get_insn_ids_for_block_at(schedule, start_idx):
+    return frozenset(
+            sub_sched_item.insn_id
+            for sub_sched_item in generate_sub_sched_items(
+                schedule, start_idx)
+            if isinstance(sub_sched_item, RunInstruction))
+
+
 def find_active_inames_at(kernel, sched_index):
     active_inames = []
 
@@ -146,8 +163,8 @@ def find_active_inames_at(kernel, sched_index):
 def has_barrier_within(kernel, sched_index):
     sched_item = kernel.schedule[sched_index]
 
-    if isinstance(sched_item, EnterLoop):
-        loop_contents, _ = gather_schedule_subloop(
+    if isinstance(sched_item, BeginBlockItem):
+        loop_contents, _ = gather_schedule_block(
                 kernel.schedule, sched_index)
         from pytools import any
         return any(isinstance(subsched_item, Barrier)
@@ -161,8 +178,8 @@ def has_barrier_within(kernel, sched_index):
 def find_used_inames_within(kernel, sched_index):
     sched_item = kernel.schedule[sched_index]
 
-    if isinstance(sched_item, EnterLoop):
-        loop_contents, _ = gather_schedule_subloop(
+    if isinstance(sched_item, BeginBlockItem):
+        loop_contents, _ = gather_schedule_block(
                 kernel.schedule, sched_index)
         run_insns = [subsched_item
                 for subsched_item in loop_contents
@@ -1259,7 +1276,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
         if isinstance(sched_item, EnterLoop):
             # {{{ recurse for nested loop
 
-            subloop, new_i = gather_schedule_subloop(schedule, i)
+            subloop, new_i = gather_schedule_block(schedule, i)
             i = new_i
 
             # Run barrier insertion for inner loop
@@ -1457,7 +1474,7 @@ def generate_loop_schedules(kernel, debug_args={}):
             for gen_sched in gen:
                 debug.stop()
 
-                gsize, lsize = kernel.get_grid_sizes_as_exprs()
+                gsize, lsize = kernel.get_grid_size_upper_bounds()
 
                 if gsize or lsize:
                     logger.info("%s: barrier insertion: global" % kernel.name)
@@ -1481,9 +1498,15 @@ def generate_loop_schedules(kernel, debug_args={}):
 
                     logger.info("%s: barrier insertion: done" % kernel.name)
 
-                yield kernel.copy(
+                new_kernel = kernel.copy(
                         schedule=gen_sched,
                         state=kernel_state.SCHEDULED)
+
+                from loopy.codegen.device_mapping import \
+                        map_schedule_onto_host_or_device
+                new_kernel = map_schedule_onto_host_or_device(new_kernel)
+                yield new_kernel
+
                 debug.start()
 
                 schedule_count += 1
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 219d66d49a47882bf76bd8c89f7298c8597dfa0e..cade5e5e060b26a8c6d44965e61500baa31e5c08 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -33,8 +33,7 @@ from pytools import memoize, memoize_method, Record
 import pytools.lex
 
 from pymbolic.primitives import (
-        Leaf, AlgebraicLeaf, Expression, Variable,
-        CommonSubexpression)
+        Leaf, AlgebraicLeaf, Expression, Variable, CommonSubexpression)
 
 from pymbolic.mapper import (
         CombineMapper as CombineMapperBase,
@@ -1094,7 +1093,7 @@ def ineq_constraint_from_expr(space, expr):
     return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr))
 
 
-def constraint_to_expr(cns, except_name=None):
+def constraint_to_expr(cns):
     # Looks like this is ok after all--get_aff() performs some magic.
     # Not entirely sure though... FIXME
     #
@@ -1102,7 +1101,13 @@ def constraint_to_expr(cns, except_name=None):
     #if ls.dim(dim_type.div):
         #raise RuntimeError("constraint has an existentially quantified variable")
 
-    return aff_to_expr(cns.get_aff(), except_name=except_name)
+    expr = aff_to_expr(cns.get_aff())
+
+    from pymbolic.primitives import Comparison
+    if cns.is_equality():
+        return Comparison(expr, "==", 0)
+    else:
+        return Comparison(expr, ">=", 0)
 
 # }}}
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index cad451c2ed041875f9dd9535d73ce20e8ac9e057..d9f89ca3571b31cb724e5a598918525cf2b4b665 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -29,6 +29,8 @@ __doc__ = """
 .. currentmodule:: loopy
 
 .. autoclass:: TargetBase
+.. autoclass:: ASTBuilderBase
+
 .. autoclass:: CTarget
 .. autoclass:: CudaTarget
 .. autoclass:: OpenCLTarget
@@ -39,7 +41,7 @@ __doc__ = """
 
 
 class TargetBase(object):
-    """Base class for all targets, i.e. different types of code that
+    """Base class for all targets, i.e. different combinations of code that
     loopy can generate.
 
     Objects of this type must be picklable.
@@ -69,19 +71,6 @@ class TargetBase(object):
 
     # }}}
 
-    # {{{ library
-
-    def function_manglers(self):
-        return []
-
-    def symbol_manglers(self):
-        return []
-
-    def preamble_generators(self):
-        return []
-
-    # }}}
-
     # {{{ top-level codegen
 
     def preprocess(self, kernel):
@@ -90,11 +79,30 @@ class TargetBase(object):
     def pre_codegen_check(self, kernel):
         pass
 
-    def generate_code(self, kernel, codegen_state, impl_arg_info):
-        pass
-
     # }}}
 
+    host_program_name_suffix = "_outer"
+    device_program_name_suffix = ""
+
+    def split_kernel_at_global_barriers(self):
+        """
+        :returns: a :class:`bool` indicating whether the kernel should
+            be split when a global barrier is encountered.
+        """
+        raise NotImplementedError()
+
+    def get_host_ast_builder(self):
+        """
+        :returns: a class implementing :class:`ASTBuilderBase` for the host code
+        """
+        raise NotImplementedError()
+
+    def get_device_ast_builder(self):
+        """
+        :returns: a class implementing :class:`ASTBuilderBase` for the host code
+        """
+        raise NotImplementedError()
+
     # {{{ types
 
     def get_dtype_registry(self):
@@ -112,8 +120,47 @@ class TargetBase(object):
 
     # }}}
 
+
+class ASTBuilderBase(object):
+    """An interface for generating (host or device) ASTs.
+    """
+
+    def __init__(self, target):
+        self.target = target
+
+    # {{{ library
+
+    def function_manglers(self):
+        return []
+
+    def symbol_manglers(self):
+        return []
+
+    def preamble_generators(self):
+        return []
+
+    # }}}
+
     # {{{ code generation guts
 
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index, function_decl, function_body):
+        raise NotImplementedError
+
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        raise NotImplementedError
+
+    def get_temporary_decls(self, codegen_state):
+        raise NotImplementedError
+
+    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+        raise NotImplementedError
+
+    @property
+    def ast_block_class(self):
+        raise NotImplementedError()
+
     def get_expression_to_code_mapper(self, codegen_state):
         raise NotImplementedError()
 
@@ -123,7 +170,6 @@ class TargetBase(object):
     def emit_barrier(self, kind, comment):
         """
         :arg kind: ``"local"`` or ``"global"``
-        :return: a :class:`loopy.codegen.GeneratedInstruction`.
         """
         raise NotImplementedError()
 
@@ -133,13 +179,101 @@ class TargetBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
-    def generate_multiple_assignment(self, codegen_state, insn):
+    def emit_assignment(self, codegen_state, lhs, rhs):
         raise NotImplementedError()
 
-    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
+    def emit_multiple_assignment(self, codegen_state, insn):
+        raise NotImplementedError()
+
+    def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic update in target %s" % type(self).__name__)
 
+    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
+            static_lbound, static_ubound, inner):
+        raise NotImplementedError()
+
+    def emit_if(self, condition_str, ast):
+        raise NotImplementedError()
+
+    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+        raise NotImplementedError()
+
+    def emit_blank_line(self):
+        raise NotImplementedError()
+
+    def emit_comment(self, s):
+        raise NotImplementedError()
+
     # }}}
 
+
+# {{{ dummy host ast builder
+
+class _DummyExpressionToCodeMapper(object):
+    def rec(self, expr, prec, type_context=None, needed_dtype=None):
+        return ""
+
+    __call__ = rec
+
+
+class _DummyASTBlock(object):
+    def __init__(self, arg):
+        self.contents = []
+
+    def __str__(self):
+        return ""
+
+
+class DummyHostASTBuilder(ASTBuilderBase):
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index, function_decl, function_body):
+        return function_body
+
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        return None
+
+    def get_temporary_decls(self, codegen_state):
+        return []
+
+    def get_expression_to_code_mapper(self, codegen_state):
+        return _DummyExpressionToCodeMapper()
+
+    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+        return None
+
+    @property
+    def ast_block_class(self):
+        return _DummyASTBlock
+
+    def emit_assignment(self, codegen_state, lhs, rhs):
+        return None
+
+    def emit_multiple_assignment(self, codegen_state, insn):
+        return None
+
+    def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
+            lhs_expr, rhs_expr, lhs_dtype):
+        return None
+
+    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
+            static_lbound, static_ubound, inner):
+        return None
+
+    def emit_if(self, condition_str, ast):
+        return None
+
+    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+        return None
+
+    def emit_blank_line(self):
+        return None
+
+    def emit_comment(self, s):
+        return None
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 5e35075623c346f908468ae3855f78883da8d652..f87b999e32890b28043fd3c6eeafc6488eebafc2 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 import six
 
 import numpy as np  # noqa
-from loopy.target import TargetBase
+from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError
 
 from pytools import memoize_method
@@ -91,6 +91,43 @@ def _preamble_generator(preamble_info):
 # }}}
 
 
+# {{{ cgen overrides
+
+from cgen import Declarator
+
+
+class POD(Declarator):
+    """A simple declarator: The type is given as a :class:`numpy.dtype`
+    and the *name* is given as a string.
+    """
+
+    def __init__(self, ast_builder, dtype, name):
+        from loopy.types import LoopyType
+        assert isinstance(dtype, LoopyType)
+
+        self.ast_builder = ast_builder
+        self.ctype = ast_builder.target.dtype_to_typename(dtype)
+        self.dtype = dtype
+        self.name = name
+
+    def get_decl_pair(self):
+        return [self.ctype], self.name
+
+    def struct_maker_code(self, name):
+        return name
+
+    def struct_format(self):
+        return self.dtype.char
+
+    def alignment_requirement(self):
+        return self.ast_builder.target.alignment_requirement(self)
+
+    def default_value(self):
+        return 0
+
+# }}}
+
+
 class CTarget(TargetBase):
     """A target for plain "C", without any parallel extensions.
     """
@@ -102,6 +139,15 @@ class CTarget(TargetBase):
         self.fortran_abi = fortran_abi
         super(CTarget, self).__init__()
 
+    def split_kernel_at_global_barriers(self):
+        return False
+
+    def get_host_ast_builder(self):
+        return DummyHostASTBuilder(self)
+
+    def get_device_ast_builder(self):
+        return CASTBuilder(self)
+
     # {{{ types
 
     @memoize_method
@@ -129,11 +175,13 @@ class CTarget(TargetBase):
 
     # }}}
 
+
+class CASTBuilder(ASTBuilderBase):
     # {{{ library
 
     def preamble_generators(self):
         return (
-                super(CTarget, self).preamble_generators() + [
+                super(CASTBuilder, self).preamble_generators() + [
                     _preamble_generator,
                     ])
 
@@ -141,35 +189,47 @@ class CTarget(TargetBase):
 
     # {{{ code generation
 
-    def generate_code(self, kernel, codegen_state, impl_arg_info):
-        from cgen import FunctionBody, FunctionDeclaration, Value, Module
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index,
+            function_decl, function_body):
+        from cgen import FunctionBody
+        return FunctionBody(function_decl, function_body)
 
-        body, implemented_domains = kernel.target.generate_body(
-                kernel, codegen_state)
+    def idi_to_cgen_declarator(self, kernel, idi):
+        if (idi.offset_for_name is not None
+                or idi.stride_for_name_and_axis is not None):
+            assert not idi.is_written
+            from cgen import Const
+            return Const(POD(self, idi.dtype, idi.name))
+        else:
+            name = idi.base_name or idi.name
+            arg = kernel.arg_dict[name]
+            from loopy.kernel.data import ArrayBase
+            if isinstance(arg, ArrayBase):
+                return arg.get_arg_decl(
+                        self,
+                        idi.name[len(name):], idi.shape, idi.dtype,
+                        idi.is_written)
+            else:
+                return arg.get_arg_decl(self)
 
-        name = kernel.name
-        if self.fortran_abi:
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        from cgen import FunctionDeclaration, Value
+
+        name = codegen_result.current_program(codegen_state).name
+        if self.target.fortran_abi:
             name += "_"
 
-        mod = Module([
-            FunctionBody(
-                kernel.target.wrap_function_declaration(
-                    kernel,
-                    FunctionDeclaration(
+        return FunctionDeclaration(
                         Value("void", name),
-                        [iai.cgen_declarator for iai in impl_arg_info])),
-                body)
-            ])
-
-        return str(mod), implemented_domains
+                        [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
+                            for idi in codegen_state.implemented_data_info])
 
-    def wrap_function_declaration(self, kernel, fdecl):
-        return fdecl
-
-    def generate_body(self, kernel, codegen_state):
-        from cgen import Block
-        body = Block()
+    def get_temporary_decls(self, codegen_state):
+        kernel = codegen_state.kernel
 
+        base_storage_decls = []
         temp_decls = []
 
         # {{{ declare temporaries
@@ -179,7 +239,6 @@ class CTarget(TargetBase):
         base_storage_to_align_bytes = {}
 
         from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value
-        from loopy.codegen import POD  # uses the correct complex type
 
         class ConstRestrictPointer(Pointer):
             def get_decl_pair(self):
@@ -189,7 +248,7 @@ class CTarget(TargetBase):
         for tv in sorted(
                 six.itervalues(kernel.temporary_variables),
                 key=lambda tv: tv.name):
-            decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)
+            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)
 
             if not tv.base_storage:
                 for idi in decl_info:
@@ -255,35 +314,27 @@ class CTarget(TargetBase):
             alignment = max(base_storage_to_align_bytes[bs_name])
             bs_var_decl = AlignedAttribute(alignment, bs_var_decl)
 
-            body.append(bs_var_decl)
-
-        body.extend(temp_decls)
+            base_storage_decls.append(bs_var_decl)
 
         # }}}
 
-        from loopy.codegen.loop import set_up_hw_parallel_loops
-        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)
-
-        from cgen import Line
-        body.append(Line())
-
-        if isinstance(gen_code.ast, Block):
-            body.extend(gen_code.ast.contents)
-        else:
-            body.append(gen_code.ast)
+        return base_storage_decls + temp_decls
 
-        return body, gen_code.implemented_domains
+    @property
+    def ast_block_class(self):
+        from cgen import Block
+        return Block
 
     # }}}
 
     # {{{ code generation guts
 
     def get_expression_to_code_mapper(self, codegen_state):
-        from loopy.target.c.codegen.expression import LoopyCCodeMapper
-        return LoopyCCodeMapper(codegen_state, fortran_abi=self.fortran_abi)
+        from loopy.target.c.codegen.expression import ExpressionToCMapper
+        return ExpressionToCMapper(
+                codegen_state, fortran_abi=self.target.fortran_abi)
 
     def get_temporary_decl(self, knl, temp_var, decl_info):
-        from loopy.codegen import POD  # uses the correct complex type
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
         if decl_info.shape:
@@ -299,20 +350,18 @@ class CTarget(TargetBase):
     def get_value_arg_decl(self, name, shape, dtype, is_written):
         assert shape == ()
 
-        from loopy.codegen import POD  # uses the correct complex type
         result = POD(self, dtype, name)
         if not is_written:
             from cgen import Const
             result = Const(result)
 
-        if self.fortran_abi:
+        if self.target.fortran_abi:
             from cgen import Pointer
             result = Pointer(result)
 
         return result
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.codegen import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
 
         arg_decl = RestrictPointer(POD(self, dtype, name))
@@ -322,26 +371,11 @@ class CTarget(TargetBase):
 
         return arg_decl
 
-    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
-        ecm = codegen_state.expression_to_code_mapper
-
-        from loopy.symbolic import aff_to_expr
-
-        from loopy.codegen import wrap_in
-        from pymbolic.mapper.stringifier import PREC_NONE
-        from cgen import For
-
-        return wrap_in(For,
-                "%s %s = %s"
-                % (self.dtype_to_typename(iname_dtype),
-                    iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
-                "%s <= %s" % (
-                    iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
-                "++%s" % iname,
-                inner)
+    def emit_assignment(self, codegen_state, lhs, rhs):
+        from cgen import Assign
+        return Assign(lhs, rhs)
 
-    def generate_multiple_assignment(self, codegen_state, insn):
+    def emit_multiple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
         from pymbolic.primitives import Variable
@@ -371,7 +405,7 @@ class CTarget(TargetBase):
         from loopy.expression import dtype_to_type_context
         str_parameters = [
                 ecm(par, PREC_NONE,
-                    dtype_to_type_context(self, tgt_dtype),
+                    dtype_to_type_context(self.target, tgt_dtype),
                     tgt_dtype)
                 for par, par_dtype, tgt_dtype in zip(
                     parameters, par_dtypes, mangle_result.arg_dtypes)]
@@ -389,7 +423,7 @@ class CTarget(TargetBase):
                         "side of instruction '%s'" % (i+1, insn.id))
             str_parameters.append(
                     "&(%s)" % ecm(a, PREC_NONE,
-                        dtype_to_type_context(self, tgt_dtype),
+                        dtype_to_type_context(self.target, tgt_dtype),
                         tgt_dtype))
 
         result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
@@ -406,6 +440,46 @@ class CTarget(TargetBase):
                 lhs_code,
                 result)
 
+    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
+            static_lbound, static_ubound, inner):
+        ecm = codegen_state.expression_to_code_mapper
+
+        from loopy.symbolic import aff_to_expr
+
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from cgen import For
+
+        return For(
+                "%s %s = %s"
+                % (self.target.dtype_to_typename(iname_dtype),
+                    iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                "%s <= %s" % (
+                    iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
+                "++%s" % iname,
+                inner)
+
+    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+        decl = POD(self, dtype, name)
+
+        from cgen import Initializer, Const
+
+        if is_const:
+            decl = Const(decl)
+
+        return Initializer(decl, val_str)
+
+    def emit_blank_line(self):
+        from cgen import Line
+        return Line()
+
+    def emit_comment(self, s):
+        from cgen import Comment
+        return Comment(s)
+
+    def emit_if(self, condition_str, ast):
+        from cgen import If
+        return If(condition_str, ast)
+
     # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 11686dcdc3b691dbab6dbdc716729637e6dc96b7..58cf06e1250477b9ff8d41a81c9f34565b7d30f5 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -29,7 +29,8 @@ import numpy as np
 
 from pymbolic.mapper import RecursiveMapper
 from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT,
-        PREC_POWER)
+        PREC_POWER,
+        PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND)
 import islpy as isl
 
 from loopy.expression import dtype_to_type_context, TypeInferenceMapper
@@ -41,7 +42,7 @@ from loopy.types import LoopyType
 
 # {{{ C code mapper
 
-class LoopyCCodeMapper(RecursiveMapper):
+class ExpressionToCMapper(RecursiveMapper):
     def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None):
         self.kernel = codegen_state.kernel
         self.codegen_state = codegen_state
@@ -149,7 +150,7 @@ class LoopyCCodeMapper(RecursiveMapper):
             if isinstance(arg, ValueArg) and self.fortran_abi:
                 prefix = "*"
 
-        result = self.kernel.mangle_symbol(expr.name)
+        result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name)
         if result is not None:
             _, c_name = result
             return prefix + c_name
@@ -223,7 +224,7 @@ class LoopyCCodeMapper(RecursiveMapper):
                         enclosing_prec, PREC_CALL)
 
             if access_info.vector_index is not None:
-                return self.kernel.target.add_vector_access(
+                return self.codegen_state.ast_builder.add_vector_access(
                     result, access_info.vector_index)
             else:
                 return result
@@ -427,7 +428,10 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         str_parameters = None
 
-        mangle_result = self.kernel.mangle_function(identifier, par_dtypes)
+        mangle_result = self.kernel.mangle_function(
+                identifier, par_dtypes,
+                ast_builder=self.codegen_state.ast_builder)
+
         if mangle_result is None:
             raise RuntimeError("function '%s' unknown--"
                     "maybe you need to register a function mangler?"
@@ -468,6 +472,21 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
 
+    def map_logical_not(self, expr, enclosing_prec, type_context):
+        return self.parenthesize_if_needed(
+                "!" + self.rec(expr.child, PREC_UNARY, type_context),
+                enclosing_prec, PREC_UNARY)
+
+    def map_logical_and(self, expr, enclosing_prec, type_context):
+        return self.parenthesize_if_needed(
+                self.join_rec(" && ", expr.children, PREC_LOGICAL_AND, type_context),
+                enclosing_prec, PREC_LOGICAL_AND)
+
+    def map_logical_or(self, expr, enclosing_prec, type_context):
+        return self.parenthesize_if_needed(
+                self.join_rec(" || ", expr.children, PREC_LOGICAL_OR, type_context),
+                enclosing_prec, PREC_LOGICAL_OR)
+
     # {{{ deal with complex-valued variables
 
     def complex_type_name(self, dtype):
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 93b5da96fc2380d34beea350667adafc23028fac..9e9d652e1fa04066493ee14dd562b7bca1966766 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -28,8 +28,8 @@ import numpy as np
 
 from pytools import memoize_method
 
-from loopy.target.c import CTarget
-from loopy.target.c.codegen.expression import LoopyCCodeMapper
+from loopy.target.c import CTarget, CASTBuilder
+from loopy.target.c.codegen.expression import ExpressionToCMapper
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.kernel.data import temp_var_scope
@@ -139,7 +139,7 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 
 # {{{ expression mapper
 
-class LoopyCudaCCodeMapper(LoopyCCodeMapper):
+class ExpressionToCudaCMapper(ExpressionToCMapper):
     _GRID_AXES = "xyz"
 
     @staticmethod
@@ -178,15 +178,10 @@ class CudaTarget(CTarget):
 
         super(CudaTarget, self).__init__()
 
-    # {{{ library
+    def get_device_ast_builder(self):
+        return CUDACASTBuilder(self)
 
-    def function_manglers(self):
-        return (
-                super(CudaTarget, self).function_manglers() + [
-                    cuda_function_mangler
-                    ])
-
-    # }}}
+    # {{{ types
 
     @memoize_method
     def get_dtype_registry(self):
@@ -213,17 +208,41 @@ class CudaTarget(CTarget):
 
     # }}}
 
+# }}}
+
+
+# {{{ ast builder
+
+class CUDACASTBuilder(CASTBuilder):
+    # {{{ library
+
+    def function_manglers(self):
+        return (
+                super(CudaTarget, self).function_manglers() + [
+                    cuda_function_mangler
+                    ])
+
+    # }}}
+
     # {{{ top-level codegen
 
-    def wrap_function_declaration(self, kernel, fdecl):
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        fdecl = super(CUDACASTBuilder, self).get_function_declaration(
+                codegen_state, codegen_result, schedule_index)
+
         from cgen.cuda import CudaGlobal, CudaLaunchBounds
         fdecl = CudaGlobal(fdecl)
 
-        if self.extern_c:
+        if self.target.extern_c:
             from cgen import Extern
             fdecl = Extern("C", fdecl)
 
-        _, local_grid_size = kernel.get_grid_sizes_as_exprs()
+        from loopy.schedule import get_insn_ids_for_block_at
+        _, local_grid_size = \
+                codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
+                        get_insn_ids_for_block_at(
+                            codegen_state.kernel.schedule, schedule_index))
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_grid_size):
@@ -259,7 +278,7 @@ class CudaTarget(CTarget):
     # {{{ code generation guts
 
     def get_expression_to_code_mapper(self, codegen_state):
-        return LoopyCudaCCodeMapper(codegen_state)
+        return ExpressionToCudaCMapper(codegen_state)
 
     _VEC_AXES = "xyzw"
 
@@ -275,11 +294,8 @@ class CudaTarget(CTarget):
             if comment:
                 comment = " /* %s */" % comment
 
-            from loopy.codegen import GeneratedInstruction
             from cgen import Statement
-            return GeneratedInstruction(
-                    ast=Statement("__syncthreads()%s" % comment),
-                    implemented_domain=None)
+            return Statement("__syncthreads()%s" % comment)
         elif kind == "global":
             raise LoopyError("CUDA does not have global barriers")
         else:
@@ -296,7 +312,7 @@ class CudaTarget(CTarget):
                     % scope)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.codegen import POD  # uses the correct complex type
+        from loopy.target.c import POD  # uses the correct complex type
         from cgen import Const
         from cgen.cuda import CudaRestrictPointer
 
@@ -311,7 +327,7 @@ class CudaTarget(CTarget):
         raise NotImplementedError("not yet: texture arguments in CUDA")
 
     def get_constant_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.codegen import POD  # uses the correct complex type
+        from loopy.target.c import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
         from cgen.cuda import CudaConstant
 
@@ -324,6 +340,4 @@ class CudaTarget(CTarget):
 
     # }}}
 
-# }}}
-
 # vim: foldmethod=marker
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 3a3346198bb8448e571d922dfce31642f272a3e4..896ea9158223435e3bef933818fbf3bc51a424b4 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -26,8 +26,8 @@ THE SOFTWARE.
 
 
 import numpy as np  # noqa
-from loopy.target.c import CTarget
-from loopy.target.c.codegen.expression import LoopyCCodeMapper
+from loopy.target.c import CTarget, CASTBuilder
+from loopy.target.c.codegen.expression import ExpressionToCMapper
 from loopy.diagnostic import LoopyError
 from pymbolic.mapper.stringifier import (PREC_SUM, PREC_CALL)
 
@@ -36,7 +36,7 @@ from pytools import memoize_method
 
 # {{{ expression mapper
 
-class LoopyISPCCodeMapper(LoopyCCodeMapper):
+class ExprToISPCMapper(ExpressionToCMapper):
     def _get_index_ctype(self):
         if self.kernel.index_dtype.numpy_dtype == np.int32:
             return "int32"
@@ -81,7 +81,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
             else:
                 return expr.name
         else:
-            return super(LoopyISPCCodeMapper, self).map_variable(
+            return super(ExprToISPCMapper, self).map_variable(
                     expr, enclosing_prec, type_context)
 
     def map_subscript(self, expr, enclosing_prec, type_context):
@@ -90,7 +90,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
         ary = self.find_array(expr)
 
         if isinstance(ary, TemporaryVariable):
-            gsize, lsize = self.kernel.get_grid_sizes_as_exprs()
+            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
                 lsize, = lsize
                 from loopy.kernel.array import get_access_info
@@ -113,7 +113,7 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
                 else:
                     return result
 
-        return super(LoopyISPCCodeMapper, self).map_subscript(
+        return super(ExprToISPCMapper, self).map_subscript(
                 expr, enclosing_prec, type_context)
 
 # }}}
@@ -157,6 +157,24 @@ class ISPCTarget(CTarget):
 
         super(ISPCTarget, self).__init__()
 
+    host_program_name_suffix = ""
+    device_program_name_suffix = "_inner"
+
+    def pre_codegen_check(self, kernel):
+        gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs()
+        if len(lsize) > 1:
+            for i, ls_i in enumerate(lsize[1:]):
+                if ls_i != 1:
+                    raise LoopyError("local axis %d (0-based) "
+                            "has length > 1, which is unsupported "
+                            "by ISPC" % ls_i)
+
+    def get_host_ast_builder(self):
+        return ISPCASTBuilder(self)
+
+    def get_device_ast_builder(self):
+        return ISPCASTBuilder(self)
+
     # {{{ types
 
     @memoize_method
@@ -169,23 +187,20 @@ class ISPCTarget(CTarget):
 
     # }}}
 
-    # {{{ top-level codegen
 
-    def generate_code(self, kernel, codegen_state, impl_arg_info):
-        from cgen import (FunctionBody, FunctionDeclaration, Value, Module,
-                Block, Line, Statement as S)
-        from cgen.ispc import ISPCExport, ISPCTask
+class ISPCASTBuilder(CASTBuilder):
+    def _arg_names_and_decls(self, codegen_state):
+        implemented_data_info = codegen_state.implemented_data_info
+        arg_names = [iai.name for iai in implemented_data_info]
 
-        knl_body, implemented_domains = kernel.target.generate_body(
-                kernel, codegen_state)
-
-        inner_name = "lp_ispc_inner_"+kernel.name
-        arg_decls = [iai.cgen_declarator for iai in impl_arg_info]
-        arg_names = [iai.name for iai in impl_arg_info]
+        arg_decls = [
+                self.idi_to_cgen_declarator(codegen_state.kernel, idi)
+                for idi in implemented_data_info]
 
         # {{{ occa compatibility hackery
 
-        if self.occa_mode:
+        from cgen import Value
+        if self.target.occa_mode:
             from cgen import ArrayOf, Const
             from cgen.ispc import ISPCUniform
 
@@ -199,97 +214,85 @@ class ISPCTarget(CTarget):
 
         # }}}
 
-        knl_fbody = FunctionBody(
-                ISPCTask(
-                    FunctionDeclaration(
-                        Value("void", inner_name),
-                        arg_decls)),
-                knl_body)
+        return arg_names, arg_decls
 
-        # {{{ generate wrapper
+    # {{{ top-level codegen
 
-        wrapper_body = Block()
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        name = codegen_result.current_program(codegen_state).name
 
-        gsize, lsize = kernel.get_grid_sizes_as_exprs()
-        if len(lsize) > 1:
-            for i, ls_i in enumerate(lsize[1:]):
-                if ls_i != 1:
-                    raise LoopyError("local axis %d (0-based) "
-                            "has length > 1, which is unsupported "
-                            "by ISPC" % ls_i)
+        from cgen import (FunctionDeclaration, Value)
+        from cgen.ispc import ISPCExport, ISPCTask
 
-        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
-        ccm = self.get_expression_to_code_mapper(codegen_state)
+        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
+
+        if codegen_state.is_generating_device_code:
+            return ISPCTask(
+                        FunctionDeclaration(
+                            Value("void", name),
+                            arg_decls))
+        else:
+            return ISPCExport(
+                    FunctionDeclaration(
+                        Value("void", name),
+                        arg_decls))
+
+    # }}}
+
+    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+        ecm = self.get_expression_to_code_mapper(codegen_state)
 
+        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
+        result = []
+        from cgen import Statement as S, Block
         if lsize:
-            wrapper_body.append(
+            result.append(
                     S("assert(programCount == %s)"
-                        % ccm(lsize[0], PREC_COMPARISON)))
+                        % ecm(lsize[0], PREC_COMPARISON)))
 
         if gsize:
             launch_spec = "[%s]" % ", ".join(
-                                ccm(gs_i, PREC_NONE)
+                                ecm(gs_i, PREC_NONE)
                                 for gs_i in gsize)
         else:
             launch_spec = ""
 
-        wrapper_body.append(
-                S("launch%s %s(%s)"
-                    % (
-                        launch_spec,
-                        inner_name,
-                        ", ".join(arg_names)
-                        ))
-                )
-
-        wrapper_fbody = FunctionBody(
-                ISPCExport(
-                    FunctionDeclaration(
-                        Value("void", kernel.name),
-                        arg_decls)),
-                wrapper_body)
+        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)
 
-        # }}}
+        result.append(S(
+            "launch%s %s(%s)" % (
+                launch_spec,
+                name,
+                ", ".join(arg_names)
+                )))
 
-        mod = Module([
-            knl_fbody,
-            Line(),
-            wrapper_fbody,
-            ])
-
-        return str(mod), implemented_domains
-
-    # }}}
+        return Block(result)
 
     # {{{ code generation guts
 
     def get_expression_to_code_mapper(self, codegen_state):
-        return LoopyISPCCodeMapper(codegen_state)
+        return ExprToISPCMapper(codegen_state)
 
     def add_vector_access(self, access_str, index):
         return "(%s)[%d]" % (access_str, index)
 
     def emit_barrier(self, kind, comment):
-        from loopy.codegen import GeneratedInstruction
         from cgen import Comment, Statement
 
         assert comment
 
         if kind == "local":
-            return GeneratedInstruction(
-                    ast=Comment("local barrier: %s" % comment),
-                    implemented_domain=None)
+            return Comment("local barrier: %s" % comment)
 
         elif kind == "global":
-            return GeneratedInstruction(
-                    ast=Statement("sync; /* %s */" % comment),
-                    implemented_domain=None)
+            return Statement("sync; /* %s */" % comment)
 
         else:
             raise LoopyError("unknown barrier kind")
 
     def get_temporary_decl(self, knl, temp_var, decl_info):
-        from loopy.codegen import POD  # uses the correct complex type
+        from loopy.target.c import POD  # uses the correct complex type
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
         shape = decl_info.shape
@@ -311,7 +314,7 @@ class ISPCTarget(CTarget):
         return ISPCUniform(decl)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
-        from loopy.codegen import POD  # uses the correct complex type
+        from loopy.target.c import POD  # uses the correct complex type
         from cgen import Const
         from cgen.ispc import ISPCUniformPointer, ISPCUniform
 
@@ -325,7 +328,7 @@ class ISPCTarget(CTarget):
         return arg_decl
 
     def get_value_arg_decl(self, name, shape, dtype, is_written):
-        result = super(ISPCTarget, self).get_value_arg_decl(
+        result = super(ISPCASTBuilder, self).get_value_arg_decl(
                 name, shape, dtype, is_written)
 
         from cgen import Reference, Const
@@ -334,7 +337,7 @@ class ISPCTarget(CTarget):
         if was_const:
             result = result.subdecl
 
-        if self.occa_mode:
+        if self.target.occa_mode:
             result = Reference(result)
 
         if was_const:
@@ -349,13 +352,12 @@ class ISPCTarget(CTarget):
 
         from loopy.symbolic import aff_to_expr
 
-        from loopy.codegen import wrap_in
         from pymbolic.mapper.stringifier import PREC_NONE
         from cgen import For
 
-        return wrap_in(For,
+        return For(
                 "uniform %s %s = %s"
-                % (self.dtype_to_typename(iname_dtype),
+                % (self.target.dtype_to_typename(iname_dtype),
                     iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                 "%s <= %s" % (
                     iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
@@ -363,6 +365,7 @@ class ISPCTarget(CTarget):
                 inner)
     # }}}
 
+
 # TODO: Generate launch code
 # TODO: Vector types (element access: done)
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index e839801fdfebaa40f6431db947200c32f880ee53..3f7b199ecb0b9e4c2653753ab29baff4f7ad4c46 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -26,8 +26,8 @@ THE SOFTWARE.
 
 import numpy as np
 
-from loopy.target.c import CTarget
-from loopy.target.c.codegen.expression import LoopyCCodeMapper
+from loopy.target.c import CTarget, CASTBuilder
+from loopy.target.c.codegen.expression import ExpressionToCMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
@@ -272,12 +272,21 @@ def opencl_preamble_generator(preamble_info):
             #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
             """)
 
+    from loopy.tools import remove_common_indentation
+    kernel = preamble_info.kernel
+    yield ("00_declare_gid_lid",
+            remove_common_indentation("""
+                #define lid(N) ((%(idx_ctype)s) get_local_id(N))
+                #define gid(N) ((%(idx_ctype)s) get_group_id(N))
+                """ % dict(idx_ctype=kernel.target.dtype_to_typename(
+                    kernel.index_dtype))))
+
 # }}}
 
 
 # {{{ expression mapper
 
-class LoopyOpenCLCCodeMapper(LoopyCCodeMapper):
+class ExpressionToOpenCLCMapper(ExpressionToCMapper):
     def map_group_hw_index(self, expr, enclosing_prec, type_context):
         return "gid(%d)" % expr.axis
 
@@ -307,29 +316,8 @@ class OpenCLTarget(CTarget):
 
         self.atomics_flavor = atomics_flavor
 
-    # {{{ library
-
-    def function_manglers(self):
-        return (
-                super(OpenCLTarget, self).function_manglers() + [
-                    opencl_function_mangler
-                    ])
-
-    def symbol_manglers(self):
-        return (
-                super(OpenCLTarget, self).symbol_manglers() + [
-                    opencl_symbol_mangler
-                    ])
-
-    def preamble_generators(self):
-        from loopy.library.reduction import reduction_preamble_generator
-        return (
-                super(OpenCLTarget, self).preamble_generators() + [
-                    opencl_preamble_generator,
-                    reduction_preamble_generator
-                    ])
-
-    # }}}
+    def get_device_ast_builder(self):
+        return OpenCLCASTBuilder(self)
 
     @memoize_method
     def get_dtype_registry(self):
@@ -359,13 +347,50 @@ class OpenCLTarget(CTarget):
 
     # }}}
 
+# }}}
+
+
+# {{{ ast builder
+
+class OpenCLCASTBuilder(CASTBuilder):
+    # {{{ library
+
+    def function_manglers(self):
+        return (
+                super(OpenCLCASTBuilder, self).function_manglers() + [
+                    opencl_function_mangler
+                    ])
+
+    def symbol_manglers(self):
+        return (
+                super(OpenCLCASTBuilder, self).symbol_manglers() + [
+                    opencl_symbol_mangler
+                    ])
+
+    def preamble_generators(self):
+        from loopy.library.reduction import reduction_preamble_generator
+        return (
+                super(OpenCLCASTBuilder, self).preamble_generators() + [
+                    opencl_preamble_generator,
+                    reduction_preamble_generator
+                    ])
+
+    # }}}
+
     # {{{ top-level codegen
 
-    def wrap_function_declaration(self, kernel, fdecl):
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
+                codegen_state, codegen_result, schedule_index)
+
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
         fdecl = CLKernel(fdecl)
 
-        _, local_sizes = kernel.get_grid_sizes_as_exprs()
+        from loopy.schedule import get_insn_ids_for_block_at
+        _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
+                get_insn_ids_for_block_at(
+                    codegen_state.kernel.schedule, schedule_index))
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_sizes):
@@ -376,25 +401,9 @@ class OpenCLTarget(CTarget):
 
         return fdecl
 
-    def generate_code(self, kernel, codegen_state, impl_arg_info):
-        code, implemented_domains = (
-                super(OpenCLTarget, self).generate_code(
-                    kernel, codegen_state, impl_arg_info))
-
-        from loopy.tools import remove_common_indentation
-        code = (
-                remove_common_indentation("""
-                    #define lid(N) ((%(idx_ctype)s) get_local_id(N))
-                    #define gid(N) ((%(idx_ctype)s) get_group_id(N))
-                    """ % dict(idx_ctype=self.dtype_to_typename(kernel.index_dtype)))
-                + "\n\n"
-                + code)
-
-        return code, implemented_domains
-
     def generate_body(self, kernel, codegen_state):
         body, implemented_domains = (
-                super(OpenCLTarget, self).generate_body(kernel, codegen_state))
+                super(OpenCLCASTBuilder, self).generate_body(kernel, codegen_state))
 
         from loopy.kernel.data import ImageArg
 
@@ -412,7 +421,7 @@ class OpenCLTarget(CTarget):
     # {{{ code generation guts
 
     def get_expression_to_code_mapper(self, codegen_state):
-        return LoopyOpenCLCCodeMapper(codegen_state)
+        return ExpressionToOpenCLCMapper(codegen_state)
 
     def add_vector_access(self, access_str, index):
         # The 'int' avoids an 'L' suffix for long ints.
@@ -427,11 +436,8 @@ class OpenCLTarget(CTarget):
             if comment:
                 comment = " /* %s */" % comment
 
-            from loopy.codegen import GeneratedInstruction
             from cgen import Statement
-            return GeneratedInstruction(
-                    ast=Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment),
-                    implemented_domain=None)
+            return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment)
         elif kind == "global":
             raise LoopyError("OpenCL does not have global barriers")
         else:
@@ -450,7 +456,7 @@ class OpenCLTarget(CTarget):
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from cgen.opencl import CLGlobal
 
-        return CLGlobal(super(OpenCLTarget, self).get_global_arg_decl(
+        return CLGlobal(super(OpenCLCASTBuilder, self).get_global_arg_decl(
             name, shape, dtype, is_written))
 
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
@@ -486,7 +492,7 @@ class OpenCLTarget(CTarget):
         if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                 np.int32, np.int64, np.float32, np.float64]:
             from cgen import Block, DoWhile, Assign
-            from loopy.codegen import POD
+            from loopy.target.c import POD
             old_val_var = codegen_state.var_name_generator("loopy_old_val")
             new_val_var = codegen_state.var_name_generator("loopy_new_val")
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 44b057f1a713a09bd2f493c8f3be67e790049e6e..806c5c26334f82ee8a57c40ffa7674e082600fab 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -2,6 +2,8 @@
 
 from __future__ import division, absolute_import
 
+import sys
+
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
 
 __license__ = """
@@ -30,8 +32,11 @@ from six.moves import range
 import numpy as np
 
 from loopy.kernel.data import CallMangleInfo
-from loopy.target.opencl import OpenCLTarget
+from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder
+from loopy.target.python import PythonASTBuilderBase
 from loopy.types import NumpyType
+from loopy.diagnostic import LoopyError
+from warnings import warn
 
 import logging
 logger = logging.getLogger(__name__)
@@ -139,7 +144,7 @@ def check_sizes(kernel, device):
         if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
             parameters[arg.name] = arg.approximately
 
-    glens, llens = kernel.get_grid_sizes_as_exprs()
+    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()
 
     if (max(len(glens), len(llens))
             > device.max_work_item_dimensions):
@@ -264,31 +269,48 @@ class PyOpenCLTarget(OpenCLTarget):
     warnings) and support for complex numbers.
     """
 
-    def __init__(self, device=None):
+    def __init__(self, device=None, pyopencl_module_name="_lpy_cl"):
         # This ensures the dtype registry is populated.
         import pyopencl.tools  # noqa
 
         super(PyOpenCLTarget, self).__init__()
 
         self.device = device
+        self.pyopencl_module_name = pyopencl_module_name
 
-    hash_fields = ["device"]
     comparison_fields = ["device"]
 
-    def function_manglers(self):
-        from loopy.library.random123 import random123_function_mangler
-        return (
-                super(PyOpenCLTarget, self).function_manglers() + [
-                    pyopencl_function_mangler,
-                    random123_function_mangler
-                    ])
+    def update_persistent_hash(self, key_hash, key_builder):
+        super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder)
+        key_builder.rec(key_hash, getattr(self.device, "persistent_unique_id", None))
 
-    def preamble_generators(self):
-        from loopy.library.random123 import random123_preamble_generator
-        return ([
-            pyopencl_preamble_generator,
-            random123_preamble_generator,
-            ] + super(PyOpenCLTarget, self).preamble_generators())
+    def __getstate__(self):
+        dev_id = None
+        if self.device is not None:
+            dev_id = self.device.persistent_unique_id
+
+        return {"device_id": dev_id, "atomics_flavor": self.atomics_flavor}
+
+    def __setstate__(self, state):
+        self.atomics_flavor = state["atomics_flavor"]
+
+        dev_id = state["device_id"]
+        if dev_id is None:
+            self.device = None
+        else:
+            import pyopencl as cl
+            matches = [
+                dev
+                for plat in cl.get_platforms()
+                for dev in plat.get_devices()
+                if dev.persistent_unique_id == dev_id]
+
+            if matches:
+                self.device = matches[0]
+            else:
+                raise LoopyError(
+                        "cannot unpickle device '%s': not found"
+                        % dev_id)
 
     def preprocess(self, kernel):
         return kernel
@@ -296,6 +318,14 @@ class PyOpenCLTarget(OpenCLTarget):
     def pre_codegen_check(self, kernel):
         check_sizes(kernel, self.device)
 
+    def get_host_ast_builder(self):
+        return PyOpenCLPythonASTBuilder(self)
+
+    def get_device_ast_builder(self):
+        return PyOpenCLCASTBuilder(self)
+
+    # {{{ types
+
     def get_dtype_registry(self):
         try:
             from pyopencl.compyte.dtypes import TYPE_REGISTRY
@@ -330,7 +360,294 @@ class PyOpenCLTarget(OpenCLTarget):
 
         return struct.calcsize(fmt)
 
+    # }}}
+
+# }}}
+
+
+# {{{ host code: value arg setup
+
+def generate_value_arg_setup(kernel, devices, implemented_data_info):
+    options = kernel.options
+
+    import loopy as lp
+    from loopy.kernel.array import ArrayBase
+
+    # {{{ arg counting bug handling
+
+    # For example:
+    # https://github.com/pocl/pocl/issues/197
+    # (but Apple CPU has a similar bug)
+
+    work_around_arg_count_bug = False
+    warn_about_arg_count_bug = False
+
+    try:
+        from pyopencl.characterize import has_struct_arg_count_bug
+
+    except ImportError:
+        count_bug_per_dev = [False]*len(devices)
+
+    else:
+        count_bug_per_dev = [
+                has_struct_arg_count_bug(dev)
+                if dev is not None else False
+                for dev in devices]
+
+    if any(dev is None for dev in devices):
+        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
+                "workarounds for broken OpenCL implementations "
+                "(such as those relating to complex numbers) "
+                "may not be enabled when needed"
+                .format(knl_name=kernel.name))
+
+    if any(count_bug_per_dev):
+        if all(count_bug_per_dev):
+            work_around_arg_count_bug = True
+        else:
+            warn_about_arg_count_bug = True
+
+    # }}}
+
+    cl_arg_idx = 0
+    arg_idx_to_cl_arg_idx = {}
+
+    fp_arg_count = 0
+
+    from genpy import (
+            Comment, Line, If, Raise, Assign, Statement as S, Suite)
+
+    result = []
+    gen = result.append
+
+    for arg_idx, idi in enumerate(implemented_data_info):
+        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
+
+        if idi.arg_class is not lp.ValueArg:
+            assert issubclass(idi.arg_class, ArrayBase)
+
+            # assume each of those generates exactly one...
+            cl_arg_idx += 1
+
+            continue
+
+        gen(Comment("{{{ process %s" % idi.name))
+        gen(Line())
+
+        if not options.skip_arg_checks:
+            gen(If("%s is None" % idi.name,
+                Raise('RuntimeError("input argument \'{name}\' '
+                        'must be supplied")'.format(name=idi.name))))
+
+        if sys.version_info < (2, 7) and idi.dtype.is_integral():
+            gen(Comment("cast to long to avoid trouble with struct packing"))
+            gen(Assign(idi.name, "long(%s)" % idi.name))
+            gen(Line())
+
+        if idi.dtype.is_composite():
+            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
+            cl_arg_idx += 1
+
+        elif idi.dtype.is_complex():
+            assert isinstance(idi.dtype, NumpyType)
+
+            dtype = idi.dtype
+
+            if warn_about_arg_count_bug:
+                warn("{knl_name}: arguments include complex numbers, and "
+                        "some (but not all) of the target devices mishandle "
+                        "struct kernel arguments (hence the workaround is "
+                        "disabled".format(
+                            knl_name=kernel.name))
+
+            if dtype.numpy_dtype == np.complex64:
+                arg_char = "f"
+            elif dtype.numpy_dtype == np.complex128:
+                arg_char = "d"
+            else:
+                raise TypeError("unexpected complex type: %s" % dtype)
+
+            if (work_around_arg_count_bug
+                    and dtype.numpy_dtype == np.complex128
+                    and fp_arg_count + 2 <= 8):
+                gen(Assign(
+                    "_lpy_buf",
+                    "_lpy_pack('{arg_char}', {arg_var}.real)"
+                    .format(arg_char=arg_char, arg_var=idi.name)))
+                gen(S(
+                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
+                    .format(cl_arg_idx=cl_arg_idx)))
+                cl_arg_idx += 1
+
+                gen(Assign(
+                    "_lpy_buf",
+                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
+                    .format(arg_char=arg_char, arg_var=idi.name)))
+                gen(S(
+                        "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
+                        .format(cl_arg_idx=cl_arg_idx)))
+                cl_arg_idx += 1
+            else:
+                gen(Assign(
+                    "_lpy_buf",
+                    "_lpy_pack('{arg_char}{arg_char}', "
+                    "{arg_var}.real, {arg_var}.imag)"
+                    .format(arg_char=arg_char, arg_var=idi.name)))
+                gen(S(
+                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
+                    .format(cl_arg_idx=cl_arg_idx)))
+                cl_arg_idx += 1
+
+            fp_arg_count += 2
+
+        elif isinstance(idi.dtype, NumpyType):
+            if idi.dtype.dtype.kind == "f":
+                fp_arg_count += 1
+
+            gen(S(
+                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
+                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))
+
+            cl_arg_idx += 1
+
+        else:
+            raise LoopyError("do not know how to pass argument of type '%s'"
+                    % idi.dtype)
+
+        gen(Line())
+
+        gen(Comment("}}}"))
+        gen(Line())
+
+    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
+
+# }}}
+
+
+def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx):
+    from loopy.kernel.array import ArrayBase
+    from genpy import Statement as S, Suite
+
+    result = []
+    gen = result.append
+
+    for arg_idx, arg in enumerate(implemented_data_info):
+        if not issubclass(arg.arg_class, ArrayBase):
+            continue
+
+        cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx]
+
+        gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, arg.name)))
+
+    return Suite(result)
+
+
+# {{{ host ast builder
+
+class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
+    """A Python host AST builder for integration with PyOpenCL.
+    """
+
+    # {{{ code generation guts
+
+    def get_function_definition(self, codegen_state, codegen_result,
+            schedule_index, function_decl, function_body):
+        args = (
+                ["_lpy_cl_kernels", "queue"]
+                + [idi.name for idi in codegen_state.implemented_data_info]
+                + ["wait_for=None"])
+
+        from genpy import Function, Suite, ImportAs, Return, FromImport, Line
+        return Function(
+                codegen_result.current_program(codegen_state).name,
+                args,
+                Suite([
+                    FromImport("struct", ["pack as _lpy_pack"]),
+                    ImportAs("pyopencl", "_lpy_cl"),
+                    Line(),
+                    function_body,
+                    Return("_lpy_evt"),
+                    ]))
+
+    def get_function_declaration(self, codegen_state, codegen_result,
+            schedule_index):
+        # no such thing in Python
+        return None
+
+    def get_temporary_decls(self, codegen_state):
+        # FIXME: Create global temporaries
+        return []
+
+    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
+        ecm = self.get_expression_to_code_mapper(codegen_state)
+
+        if not gsize:
+            gsize = (1,)
+        if not lsize:
+            lsize = (1,)
+
+        value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
+                generate_value_arg_setup(codegen_state.kernel, [self.target.device],
+                        codegen_state.implemented_data_info)
+        arry_arg_code = generate_array_arg_setup(
+                codegen_state.kernel,
+                codegen_state.implemented_data_info,
+                arg_idx_to_cl_arg_idx)
+
+        from genpy import Suite, Assign, Assert, Line, Comment
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        # TODO: Generate finer-grained dependency structure
+        return Suite([
+            Comment("{{{ enqueue %s" % name),
+            Line(),
+            Assign("_lpy_knl", "_lpy_cl_kernels."+name),
+            Assert("_lpy_knl.num_args == %d" % cl_arg_count),
+            Line(),
+            value_arg_code,
+            arry_arg_code,
+            Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
+                "queue, _lpy_knl, "
+                "%(gsize)s, %(lsize)s,  wait_for=wait_for, g_times_l=True)"
+                % dict(
+                    pyopencl_module_name=self.target.pyopencl_module_name,
+                    gsize=ecm(gsize, prec=PREC_NONE, type_context="i"),
+                    lsize=ecm(lsize, prec=PREC_NONE, type_context="i"))),
+            Assign("wait_for", "[_lpy_evt]"),
+            Line(),
+            Comment("}}}"),
+            Line(),
+            ])
+
+    # }}}
+
 # }}}
 
 
+# {{{ device ast builder
+
+class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
+    """A C device AST builder for integration with PyOpenCL.
+    """
+
+    # {{{ library
+
+    def function_manglers(self):
+        from loopy.library.random123 import random123_function_mangler
+        return (
+                super(PyOpenCLCASTBuilder, self).function_manglers() + [
+                    pyopencl_function_mangler,
+                    random123_function_mangler
+                    ])
+
+    def preamble_generators(self):
+        from loopy.library.random123 import random123_preamble_generator
+        return ([
+            pyopencl_preamble_generator,
+            random123_preamble_generator,
+            ] + super(PyOpenCLCASTBuilder, self).preamble_generators())
+
+    # }}}
+
+# }}}
 # vim: foldmethod=marker
diff --git a/loopy/target/python.py b/loopy/target/python.py
new file mode 100644
index 0000000000000000000000000000000000000000..83e8df12459dc0b79d0789341dfc1213c008e084
--- /dev/null
+++ b/loopy/target/python.py
@@ -0,0 +1,128 @@
+"""Python host AST builder for integration with PyOpenCL."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from pymbolic.mapper.stringifier import StringifyMapper
+from loopy.expression import TypeInferenceMapper
+from loopy.kernel.data import ValueArg
+from loopy.diagnostic import LoopyError
+from loopy.target import ASTBuilderBase
+
+
+# {{{ expression to code
+
+class ExpressionToPythonMapper(StringifyMapper):
+    def __init__(self, codegen_state, type_inf_mapper=None):
+        self.kernel = codegen_state.kernel
+        self.codegen_state = codegen_state
+
+        if type_inf_mapper is None:
+            type_inf_mapper = TypeInferenceMapper(self.kernel)
+        self.type_inf_mapper = type_inf_mapper
+
+    def rec(self, expr, prec, type_context=None, needed_dtype=None):
+        return super(ExpressionToPythonMapper, self).rec(expr, prec)
+
+    __call__ = rec
+
+    def map_constant(self, expr, enclosing_prec):
+        return repr(expr)
+
+    def map_variable(self, expr, enclosing_prec):
+        if expr.name in self.kernel.all_inames():
+            return super(ExpressionToPythonMapper, self).map_variable(
+                    expr, enclosing_prec)
+
+        var_descr = self.kernel.get_var_descriptor(expr.name)
+        if isinstance(var_descr, ValueArg):
+            return super(ExpressionToPythonMapper, self).map_variable(
+                    expr, enclosing_prec)
+
+        raise LoopyError("may not refer to %s '%s' in host code"
+                % (type(var_descr).__name__, expr.name))
+
+    def map_subscript(self, expr, enclosing_prec):
+        raise LoopyError("may not subscript '%s' in host code"
+                % expr.aggregate.name)
+
+# }}}
+
+
+# {{{ ast builder
+
+class PythonASTBuilderBase(ASTBuilderBase):
+    """A Python host AST builder for integration with PyOpenCL.
+    """
+
+    # {{{ code generation guts
+
+    def get_expression_to_code_mapper(self, codegen_state):
+        return ExpressionToPythonMapper(codegen_state)
+
+    @property
+    def ast_block_class(self):
+        from genpy import Suite
+        return Suite
+
+    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
+            static_lbound, static_ubound, inner):
+        ecm = codegen_state.expression_to_code_mapper
+
+        from loopy.symbolic import aff_to_expr
+
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from genpy import For
+
+        return For(
+                (iname,),
+                "range(%s, %s + 1)"
+                % (
+                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
+                    ecm(aff_to_expr(static_ubound), PREC_NONE, "i"),
+                    ),
+                inner)
+
+    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
+        from genpy import Assign
+        return Assign(name, val_str)
+
+    def emit_blank_line(self):
+        from genpy import Line
+        return Line()
+
+    def emit_comment(self, s):
+        from genpy import Comment
+        return Comment(s)
+
+    def emit_if(self, condition_str, ast):
+        from genpy import If
+        return If(condition_str, ast)
+
+    # }}}
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/types.py b/loopy/types.py
index d80650eb6b203c70b2abf60481ebb9e38ca99b9f..b897d9f700b198e73d95a09c7d459ed2d7f877b1 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -116,10 +116,6 @@ class NumpyType(LoopyType):
         self.dtype = self.target.get_or_register_dtype([name], NumpyType(dtype))
 
     def with_target(self, target):
-        if (self.target is not None
-                and target is not self.target):
-            raise RuntimeError("target already set to different value")
-
         return type(self)(self.dtype, target)
 
     def assert_has_target(self):
diff --git a/loopy/version.py b/loopy/version.py
index 7716feea32079bb122557223b2a1e970ff630ecc..b1b7927babd7a33aed91eb390c4ca2268ae0f204 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v26-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v27-islpy%s" % _islpy_version
diff --git a/requirements.txt b/requirements.txt
index 5dbc4b050f4dd688e038b994281e08e73ab3133b..c4dbe7a6de33a8a0b801bb94fcc8b506b790091c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
-git+git://github.com/inducer/pytools
-git+git://github.com/inducer/islpy
-git+git://github.com/inducer/cgen
-git+git://github.com/pyopencl/pyopencl
-git+git://github.com/inducer/pymbolic
+git+https://github.com/inducer/pytools.git
+git+https://github.com/inducer/islpy.git
+git+https://github.com/inducer/cgen.git
+git+https://github.com/pyopencl/pyopencl.git
+git+https://github.com/inducer/pymbolic.git
+git+https://github.com/inducer/genpy.git
 
 hg+https://bitbucket.org/inducer/f2py
 
diff --git a/setup.py b/setup.py
index 30d6dfb636c4d5cea181da4a6b7e6df514adca9d..bd24612e93cef3b9e25748b4d157d0df4b7209d5 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ setup(name="loo.py",
       install_requires=[
           "pytools>=2016.1",
           "pymbolic>=2016.2",
+          "genpy>=2016.1",
           "cgen>=2016.1",
           "islpy>=2016.1.2",
           "six>=1.8.0",
diff --git a/test/test_loopy.py b/test/test_loopy.py
index a61b6563ff254fdf6eaf6279158a97d20045a663..588cc9a2015a983d86803848bb5e4d84ec6ae678 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1740,18 +1740,24 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory):
                 outer_tag=outer_tag)
         knl = lp.set_loop_priority(knl, "i_outer")
 
-        a = cl.clrandom.rand(queue, 20, np.float32)
+        a = cl.array.empty(queue, 20, np.float32)
+        a.fill(17)
         a_ref = a.copy()
         a_knl = a.copy()
 
-        knl = lp.set_options(knl, "write_cl")
+        knl = lp.set_options(knl, write_cl=True)
+        print("TEST-----------------------------------------")
         knl(queue, a=a_knl)
+        print("REF-----------------------------------------")
         ref_knl(queue, a=a_ref)
+        print("DONE-----------------------------------------")
 
-        queue.finish()
-
+        print("REF", a_ref)
+        print("KNL", a_knl)
         assert (a_ref == a_knl).get().all()
 
+        print("_________________________________")
+
 
 def test_multiple_writes_to_local_temporary():
     # Loopy would previously only handle barrier insertion correctly if exactly
@@ -2391,10 +2397,12 @@ def test_ispc_target(occa_mode=False):
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
 
-    print(
-            lp.generate_code(
+    codegen_result = lp.generate_code_v2(
                 lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
+                    lp.preprocess_kernel(knl)))
+
+    print(codegen_result.device_code())
+    print(codegen_result.host_code())
 
 
 def test_cuda_target():
@@ -2556,6 +2564,81 @@ def test_random123(ctx_factory, tp):
     assert (0 <= out).all()
 
 
+def test_kernel_splitting(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            c[i] = a[i + 1]
+            out[i] = c[i]
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl,
+            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    # schedule
+    from loopy.preprocess import preprocess_kernel
+    knl = preprocess_kernel(knl)
+
+    from loopy.schedule import get_one_scheduled_kernel
+    knl = get_one_scheduled_kernel(knl)
+
+    # map schedule onto host or device
+    print(knl)
+
+    cgr = lp.generate_code_v2(knl)
+
+    assert len(cgr.device_programs) == 2
+
+    print(cgr.device_code())
+    print(cgr.host_code())
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
+def test_kernel_splitting_with_loop(ctx_factory):
+    #ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i,k]: 0<=i<n and 0<=k<3 }",
+            """
+            c[k,i] = a[k, i + 1]
+            out[k,i] = c[k,i]
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl,
+            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    # schedule
+    from loopy.preprocess import preprocess_kernel
+    knl = preprocess_kernel(knl)
+
+    from loopy.schedule import get_one_scheduled_kernel
+    knl = get_one_scheduled_kernel(knl)
+
+    # map schedule onto host or device
+    print(knl)
+
+    cgr = lp.generate_code_v2(knl)
+
+    assert len(cgr.device_programs) == 2
+
+    print(cgr.device_code())
+    print(cgr.host_code())
+
+    # Doesn't yet work--not passing k
+    #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])