diff --git a/.gitmodules b/.gitmodules
index 504e23cf344e2d5ae35f6f6abe97458b8c7a39b8..41cf31d9cfcf85de99569ea131c3f367e68a3436 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "loopy/target/opencl/compyte"]
-	path = loopy/target/opencl/compyte
+[submodule "loopy/target/c/compyte"]
+	path = loopy/target/c/compyte
 	url = https://github.com/inducer/compyte
diff --git a/bin/loopy b/bin/loopy
index ed8f0ff16e4e82b02b09da126660db1baaed4035..2a657174d4e390f80331689068b8d21e8eab0de7 100644
--- a/bin/loopy
+++ b/bin/loopy
@@ -116,7 +116,7 @@ def main():
 
         kernels = [kernel]
 
-    elif args.lang in ["fortran", "floopy"]:
+    elif args.lang in ["fortran", "floopy", "fpp"]:
         pre_transform_code = None
         if args.transform:
             with open(args.transform, "r") as xform_fd:
@@ -132,7 +132,8 @@ def main():
                         + pre_transform_code)
 
         from loopy.frontend.fortran import f2loopy
-        kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code)
+        kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code,
+                use_c_preprocessor=(args.lang == "fpp"))
 
         if args.name is not None:
             kernels = [kernel for kernel in kernels
diff --git a/examples/fortran/run-floopy.sh b/examples/fortran/run-floopy.sh
index 6a718bb69ba9db8ec2a8f5cbf3b1314676d2f6ab..fcea2c8b5ed58eed8738ad263df62cdf687b3d0f 100755
--- a/examples/fortran/run-floopy.sh
+++ b/examples/fortran/run-floopy.sh
@@ -3,4 +3,4 @@
 NAME="$1"
 shift
 
-python $(which loopy) --lang=floopy "$NAME" - "$@"
+python $(which loopy) --lang=fpp "$NAME" - "$@"
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 3bc1d587c897d41b27c1f56e5a938881f58b0f03..12329bbf6472867c873958f1a18f20ba1016b5f7 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -31,7 +31,8 @@ import islpy as isl
 from islpy import dim_type
 
 from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
-        SubstitutionRuleMappingContext)
+        SubstitutionRuleMappingContext,
+        TaggedVariable, Reduction, LinearSubscript, )
 from loopy.diagnostic import LoopyError
 
 
@@ -62,12 +63,14 @@ from loopy.padding import (split_arg_axis, find_padding_multiple,
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.codegen import generate_code
+from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
 from loopy.auto_test import auto_test_vs_ref
 
 __all__ = [
+        "TaggedVariable", "Reduction", "LinearSubscript",
+
         "auto",
 
         "LoopKernel",
@@ -94,7 +97,7 @@ __all__ = [
 
         "preprocess_kernel", "realize_reduction", "infer_unknown_types",
         "generate_loop_schedules", "get_one_scheduled_kernel",
-        "generate_code",
+        "generate_code", "generate_body",
 
         "CompiledKernel",
 
@@ -555,9 +558,10 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
     # {{{ normalize arguments, find unique new_inames
 
     if isinstance(inames, str):
-        inames = inames.split(",")
+        inames = [iname.strip() for iname in inames.split(",")]
+
     if isinstance(new_inames, str):
-        new_inames = new_inames.split(",")
+        new_inames = [iname.strip() for iname in new_inames.split(",")]
 
     from loopy.context_matching import parse_stack_match
     within = parse_stack_match(within)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index c14c1ffa24157a51c58b31a166913e94a558e062..26f225c6293e62ef43adc921bfef20a3a2a534b1 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -1,6 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
 
 from loopy.diagnostic import LoopyError
 from pytools import Record
@@ -146,6 +145,26 @@ def add_comment(cmt, code):
 # }}}
 
 
+class SeenFunction(Record):
+    """
+    .. attribute:: name
+    .. attribute:: c_name
+    .. attribute:: arg_dtypes
+
+        a tuple of arg dtypes
+    """
+
+    def __init__(self, name, c_name, arg_dtypes):
+        Record.__init__(self,
+                name=name,
+                c_name=c_name,
+                arg_dtypes=arg_dtypes)
+
+    def __hash__(self):
+        return hash((type(self),)
+                + tuple((f, getattr(self, f)) for f in type(self).fields))
+
+
 # {{{ code generation state
 
 class CodeGenerationState(object):
@@ -160,23 +179,26 @@ class CodeGenerationState(object):
         A :class:`frozenset` of predicates for which checks have been
         implemented.
 
-    .. attribute:: c_code_mapper
+    .. attribute:: expression_to_code_mapper
 
         A :class:`loopy.codegen.expression.CCodeMapper` that does not take
         per-ILP assignments into account.
     """
-    def __init__(self, implemented_domain, implemented_predicates, c_code_mapper):
+    def __init__(self, implemented_domain, implemented_predicates,
+            expression_to_code_mapper):
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
-        self.c_code_mapper = c_code_mapper
+        self.expression_to_code_mapper = expression_to_code_mapper
 
     def copy(self, implemented_domain=None, implemented_predicates=frozenset(),
-            c_code_mapper=None):
+            expression_to_code_mapper=None):
         return CodeGenerationState(
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
                     implemented_predicates or self.implemented_predicates),
-                c_code_mapper=c_code_mapper or self.c_code_mapper)
+                expression_to_code_mapper=(
+                    expression_to_code_mapper
+                    or self.expression_to_code_mapper))
 
     def intersect(self, other):
         new_impl, new_other = isl.align_two(self.implemented_domain, other)
@@ -205,7 +227,8 @@ class CodeGenerationState(object):
         new_impl_domain = new_impl_domain.add_constraint(cns)
         return self.copy(
                 implemented_domain=new_impl_domain,
-                c_code_mapper=self.c_code_mapper.copy_and_assign(iname, expr))
+                expression_to_code_mapper=(
+                    self.expression_to_code_mapper.copy_and_assign(iname, expr)))
 
 # }}}
 
@@ -367,34 +390,13 @@ def generate_code(kernel, device=None):
     from loopy.check import pre_codegen_checks
     pre_codegen_checks(kernel)
 
-    from cgen import (FunctionBody, FunctionDeclaration,
-            Value, Module, Block,
-            Line, Const, LiteralLines, Initializer)
-
     logger.info("%s: generate code: start" % kernel.name)
 
-    from cgen.opencl import (CLKernel, CLRequiredWorkGroupSize)
-
-    allow_complex = False
-    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
-        if var.dtype.kind == "c":
-            allow_complex = True
-
-    seen_dtypes = set()
-    seen_functions = set()
-
-    from loopy.codegen.expression import LoopyCCodeMapper
-    ccm = (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions,
-        allow_complex=allow_complex))
-
-    mod = []
-
-    body = Block()
-
     # {{{ examine arg list
 
-    from loopy.kernel.data import ImageArg, ValueArg
+    from loopy.kernel.data import ValueArg
     from loopy.kernel.array import ArrayBase
+    from cgen import Const
 
     impl_arg_info = []
 
@@ -417,55 +419,29 @@ def generate_code(kernel, device=None):
         else:
             raise ValueError("argument type not understood: '%s'" % type(arg))
 
-    if any(isinstance(arg, ImageArg) for arg in kernel.args):
-        body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")),
-            "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
-                "| CLK_FILTER_NEAREST"))
+    allow_complex = False
+    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
+        if var.dtype.kind == "c":
+            allow_complex = True
 
     # }}}
 
-    mod.extend([
-        LiteralLines(r"""
-        #define lid(N) ((%(idx_ctype)s) get_local_id(N))
-        #define gid(N) ((%(idx_ctype)s) get_group_id(N))
-        """ % dict(idx_ctype=kernel.target.dtype_to_typename(kernel.index_dtype))),
-        Line()])
-
-    # {{{ build lmem array declarators for temporary variables
-
-    body.extend(
-            idi.cgen_declarator
-            for tv in six.itervalues(kernel.temporary_variables)
-            for idi in tv.decl_info(
-                kernel.target,
-                is_written=True, index_dtype=kernel.index_dtype))
-
-    # }}}
+    seen_dtypes = set()
+    seen_functions = set()
 
     initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
     codegen_state = CodeGenerationState(
             implemented_domain=initial_implemented_domain,
             implemented_predicates=frozenset(),
-            c_code_mapper=ccm)
-
-    from loopy.codegen.loop import set_up_hw_parallel_loops
-    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)
+            expression_to_code_mapper=kernel.target.get_expression_to_code_mapper(
+                kernel, seen_dtypes, seen_functions, allow_complex))
 
-    body.append(Line())
+    code_str, implemented_domains = kernel.target.generate_code(
+            kernel, codegen_state, impl_arg_info)
 
-    if isinstance(gen_code.ast, Block):
-        body.extend(gen_code.ast.contents)
-    else:
-        body.append(gen_code.ast)
-
-    mod.append(
-        FunctionBody(
-            CLRequiredWorkGroupSize(
-                kernel.get_grid_sizes_as_exprs()[1],
-                CLKernel(FunctionDeclaration(
-                    Value("void", kernel.name),
-                    [iai.cgen_declarator for iai in impl_arg_info]))),
-            body))
+    from loopy.check import check_implemented_domains
+    assert check_implemented_domains(kernel, implemented_domains,
+            code_str)
 
     # {{{ handle preambles
 
@@ -491,20 +467,18 @@ def generate_code(kernel, device=None):
         seen_preamble_tags.add(tag)
         dedup_preambles.append(preamble)
 
-    mod = ([LiteralLines(lines) for lines in dedup_preambles]
-            + [Line()] + mod)
-
-    # }}}
+    from loopy.tools import remove_common_indentation
+    preamble_codes = [
+            remove_common_indentation(lines) + "\n"
+            for lines in dedup_preambles]
 
-    result = str(Module(mod))
+    code_str = "".join(preamble_codes) + code_str
 
-    from loopy.check import check_implemented_domains
-    assert check_implemented_domains(kernel, gen_code.implemented_domains,
-            result)
+    # }}}
 
     logger.info("%s: generate code: done" % kernel.name)
 
-    result = result, impl_arg_info
+    result = code_str, impl_arg_info
 
     if CACHING_ENABLED:
         code_gen_cache[input_kernel] = result
@@ -514,4 +488,51 @@ def generate_code(kernel, device=None):
 # }}}
 
 
+# {{{ generate function body
+
+def generate_body(kernel):
+    if kernel.schedule is None:
+        from loopy.schedule import get_one_scheduled_kernel
+        kernel = get_one_scheduled_kernel(kernel)
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.SCHEDULED:
+        raise LoopyError("cannot generate code for a kernel that has not been "
+                "scheduled")
+
+    from loopy.preprocess import infer_unknown_types
+    kernel = infer_unknown_types(kernel, expect_completion=True)
+
+    from loopy.check import pre_codegen_checks
+    pre_codegen_checks(kernel)
+
+    logger.info("%s: generate code: start" % kernel.name)
+
+    allow_complex = False
+    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
+        if var.dtype.kind == "c":
+            allow_complex = True
+
+    seen_dtypes = set()
+    seen_functions = set()
+
+    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
+    codegen_state = CodeGenerationState(
+            implemented_domain=initial_implemented_domain,
+            implemented_predicates=frozenset(),
+            expression_to_code_mapper=kernel.target.get_expression_to_code_mapper(
+                kernel, seen_dtypes, seen_functions, allow_complex))
+
+    code_str, implemented_domains = kernel.target.generate_body(
+            kernel, codegen_state)
+
+    from loopy.check import check_implemented_domains
+    assert check_implemented_domains(kernel, implemented_domains,
+            code_str)
+
+    logger.info("%s: generate code: done" % kernel.name)
+
+    return code_str
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index c9e79f6bcc17f3ed3166dfc18cc012721410af6c..19ac4106ba58821e5d4bf5231eb530977739c7f3 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -28,14 +28,14 @@ from islpy import dim_type
 from pymbolic.mapper.stringifier import PREC_NONE
 
 
-def constraint_to_code(ccm, cns):
+def constraint_to_code(ecm, cns):
     if cns.is_equality():
         comp_op = "=="
     else:
         comp_op = ">="
 
     from loopy.symbolic import constraint_to_expr
-    return "%s %s 0" % (ccm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op)
+    return "%s %s 0" % (ecm(constraint_to_expr(cns), PREC_NONE, "i"), comp_op)
 
 
 # {{{ bounds check generator
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 3927293ae82dc521327a8c17afae5e93be1e3aa9..b28244a8c01e72611244f33901793c37804b00ad 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -77,7 +77,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
 
     elif isinstance(sched_item, Barrier):
         from loopy.codegen import GeneratedInstruction
-        from cgen import Statement as S
+        from cgen import Statement as S  # noqa
 
         if sched_item.comment:
             comment = " /* %s */" % sched_item.comment
@@ -362,7 +362,8 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                 from loopy.codegen.bounds import constraint_to_code
 
                 conditionals = [
-                        constraint_to_code(codegen_state.c_code_mapper, cns)
+                        constraint_to_code(
+                            codegen_state.expression_to_code_mapper, cns)
                         for cns in bounds_checks] + list(pred_checks)
 
                 result = [wrap_in_if(conditionals, gen_code_block(result))]
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 1bd977f3ea2e8e15e63424e89b1e347063da70e2..b98716a4e6281ff559abc670a8e1b0fe5d67976d 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -45,7 +45,9 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st
     if bounds_check_set.is_empty():
         return None, None
 
-    condition_codelets = [constraint_to_code(codegen_state.c_code_mapper, cns)
+    condition_codelets = [
+            constraint_to_code(
+                codegen_state.expression_to_code_mapper, cns)
             for cns in bounds_checks]
 
     condition_codelets.extend(
@@ -86,7 +88,7 @@ def generate_instruction_code(kernel, insn, codegen_state):
 
 
 def generate_expr_instruction_code(kernel, insn, codegen_state):
-    ccm = codegen_state.c_code_mapper
+    ecm = codegen_state.expression_to_code_mapper
 
     expr = insn.expression
 
@@ -94,16 +96,16 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     target_dtype = kernel.get_var_descriptor(assignee_var_name).dtype
 
     from cgen import Assign
-    from loopy.codegen.expression import dtype_to_type_context
-    lhs_code = ccm(insn.assignee, prec=PREC_NONE, type_context=None)
+    from loopy.expression import dtype_to_type_context
+    lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
     result = Assign(
             lhs_code,
-            ccm(expr, prec=PREC_NONE,
+            ecm(expr, prec=PREC_NONE,
                 type_context=dtype_to_type_context(kernel.target, target_dtype),
                 needed_dtype=target_dtype))
 
     if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
-        from cgen import Statement as S
+        from cgen import Statement as S  # noqa
 
         gs, ls = kernel.get_grid_sizes()
 
@@ -123,7 +125,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
         if assignee_indices:
             printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])
             printf_args.extend(
-                    ccm(i, prec=PREC_NONE, type_context="i")
+                    ecm(i, prec=PREC_NONE, type_context="i")
                     for i in assignee_indices)
 
         if kernel.options.trace_assignment_values:
@@ -158,7 +160,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
 
 
 def generate_c_instruction_code(kernel, insn, codegen_state):
-    ccm = codegen_state.c_code_mapper
+    ecm = codegen_state.expression_to_code_mapper
 
     body = []
 
@@ -168,14 +170,14 @@ def generate_c_instruction_code(kernel, insn, codegen_state):
     from pymbolic.primitives import Variable
     for name, iname_expr in insn.iname_exprs:
         if (isinstance(iname_expr, Variable)
-                and name not in ccm.var_subst_map):
+                and name not in ecm.var_subst_map):
             # No need, the bare symbol will work
             continue
 
         body.append(
                 Initializer(
                     POD(kernel.target, kernel.index_dtype, name),
-                    codegen_state.c_code_mapper(
+                    codegen_state.expression_to_code_mapper(
                         iname_expr, prec=PREC_NONE, type_context="i")))
 
     if body:
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 40f433a90f90855aa554261b8d6fb5cbdd5fb0ec..dcea94690c2cca630b40a870f777d884715f1dd4 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -250,8 +250,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
         # slabbing conditionals.
         slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
         new_codegen_state = codegen_state.copy(
-                c_code_mapper=codegen_state.c_code_mapper.copy_and_assign(
-                    iname, hw_axis_expr))
+                expression_to_code_mapper=(
+                    codegen_state.expression_to_code_mapper.copy_and_assign(
+                        iname, hw_axis_expr)))
 
         inner = set_up_hw_parallel_loops(
                 slabbed_kernel, sched_index,
@@ -268,7 +269,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
 # {{{ sequential loop
 
 def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
-    ccm = codegen_state.c_code_mapper
+    ecm = codegen_state.expression_to_code_mapper
     loop_iname = kernel.schedule[sched_index].iname
 
     slabs = get_slab_decomposition(
@@ -319,11 +320,11 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
         static_lbound = static_min_of_pw_aff(
                 kernel.cache_manager.dim_min(
                     dom_and_slab, loop_iname_idx).coalesce(),
-                constants_only=False)
+                constants_only=False, prefer_constants=False)
         static_ubound = static_max_of_pw_aff(
                 kernel.cache_manager.dim_max(
                     dom_and_slab, loop_iname_idx).coalesce(),
-                constants_only=False)
+                constants_only=False, prefer_constants=False)
 
         # }}}
 
@@ -362,7 +363,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
             # single-trip, generate just a variable assignment, not a loop
             result.append(gen_code_block([
                 Initializer(Const(POD(kernel.index_dtype, loop_iname)),
-                    ccm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                 Line(),
                 inner,
                 ]))
@@ -373,9 +374,9 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
             result.append(wrap_in(For,
                     "%s %s = %s"
                     % (kernel.target.dtype_to_typename(kernel.index_dtype),
-                        loop_iname, ccm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                        loop_iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                     "%s <= %s" % (
-                        loop_iname, ccm(aff_to_expr(static_ubound), PREC_NONE, "i")),
+                        loop_iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
                     "++%s" % loop_iname,
                     inner))
 
diff --git a/loopy/expression.py b/loopy/expression.py
new file mode 100644
index 0000000000000000000000000000000000000000..2afb803b97b0e5b9f9ff1510da92ad10a50711dc
--- /dev/null
+++ b/loopy/expression.py
@@ -0,0 +1,257 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+
+from pymbolic.mapper import CombineMapper
+
+from loopy.tools import is_integer
+from loopy.diagnostic import TypeInferenceFailure, DependencyTypeInferenceFailure
+
+
+# type_context may be:
+# - 'i' for integer -
+# - 'f' for single-precision floating point
+# - 'd' for double-precision floating point
+# or None for 'no known context'.
+
+def dtype_to_type_context(target, dtype):
+    dtype = np.dtype(dtype)
+
+    if dtype.kind == 'i':
+        return 'i'
+    if dtype in [np.float64, np.complex128]:
+        return 'd'
+    if dtype in [np.float32, np.complex64]:
+        return 'f'
+    if target.is_vector_dtype(dtype):
+        return dtype_to_type_context(target, dtype.fields["x"][0])
+
+    return None
+
+
+# {{{ type inference
+
+class TypeInferenceMapper(CombineMapper):
+    def __init__(self, kernel, new_assignments=None):
+        """
+        :arg new_assignments: mapping from names to either
+            :class:`loopy.kernel.data.TemporaryVariable`
+            or
+            :class:`loopy.kernel.data.KernelArgument`
+            instances
+        """
+        self.kernel = kernel
+        if new_assignments is None:
+            new_assignments = {}
+        self.new_assignments = new_assignments
+
+    # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
+    # are Python-equal (for many common constants such as integers).
+
+    @staticmethod
+    def combine(dtypes):
+        dtypes = list(dtypes)
+
+        result = dtypes.pop()
+        while dtypes:
+            other = dtypes.pop()
+
+            if result.isbuiltin and other.isbuiltin:
+                if (result, other) in [
+                        (np.int32, np.float32), (np.float32, np.int32)]:
+                    # numpy makes this a double. I disagree.
+                    result = np.dtype(np.float32)
+                else:
+                    result = (
+                            np.empty(0, dtype=result)
+                            + np.empty(0, dtype=other)
+                            ).dtype
+            elif result.isbuiltin and not other.isbuiltin:
+                # assume the non-native type takes over
+                result = other
+            elif not result.isbuiltin and other.isbuiltin:
+                # assume the non-native type takes over
+                pass
+            else:
+                if result is not other:
+                    raise TypeInferenceFailure(
+                            "nothing known about result of operation on "
+                            "'%s' and '%s'" % (result, other))
+
+        return result
+
+    def map_sum(self, expr):
+        dtypes = []
+        small_integer_dtypes = []
+        for child in expr.children:
+            dtype = self.rec(child)
+            if is_integer(child) and abs(child) < 1024:
+                small_integer_dtypes.append(dtype)
+            else:
+                dtypes.append(dtype)
+
+        from pytools import all
+        if all(dtype.kind == "i" for dtype in dtypes):
+            dtypes.extend(small_integer_dtypes)
+
+        return self.combine(dtypes)
+
+    map_product = map_sum
+
+    def map_quotient(self, expr):
+        n_dtype = self.rec(expr.numerator)
+        d_dtype = self.rec(expr.denominator)
+
+        if n_dtype.kind in "iu" and d_dtype.kind in "iu":
+            # both integers
+            return np.dtype(np.float64)
+
+        else:
+            return self.combine([n_dtype, d_dtype])
+
+    def map_constant(self, expr):
+        if is_integer(expr):
+            for tp in [np.int32, np.int64]:
+                iinfo = np.iinfo(tp)
+                if iinfo.min <= expr <= iinfo.max:
+                    return np.dtype(tp)
+
+            else:
+                raise TypeInferenceFailure("integer constant '%s' too large" % expr)
+
+        dt = np.asarray(expr).dtype
+        if hasattr(expr, "dtype"):
+            return expr.dtype
+        elif isinstance(expr, np.number):
+            # Numpy types are sized
+            return np.dtype(type(expr))
+        elif dt.kind == "f":
+            # deduce the smaller type by default
+            return np.dtype(np.float32)
+        elif dt.kind == "c":
+            if np.complex64(expr) == np.complex128(expr):
+                # (COMPLEX_GUESS_LOGIC)
+                # No precision is lost by 'guessing' single precision, use that.
+                # This at least covers simple cases like '1j'.
+                return np.dtype(np.complex64)
+
+            # Codegen for complex types depends on exactly correct types.
+            # Refuse temptation to guess.
+            raise TypeInferenceFailure("Complex constant '%s' needs to "
+                    "be sized for type inference " % expr)
+        else:
+            raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
+
+    def map_subscript(self, expr):
+        return self.rec(expr.aggregate)
+
+    def map_linear_subscript(self, expr):
+        return self.rec(expr.aggregate)
+
+    def map_call(self, expr):
+        from pymbolic.primitives import Variable
+
+        identifier = expr.function
+        if isinstance(identifier, Variable):
+            identifier = identifier.name
+
+        arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
+
+        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
+        if mangle_result is not None:
+            return mangle_result[0]
+
+        raise RuntimeError("no type inference information on "
+                "function '%s'" % identifier)
+
+    def map_variable(self, expr):
+        if expr.name in self.kernel.all_inames():
+            return self.kernel.index_dtype
+
+        result = self.kernel.mangle_symbol(expr.name)
+        if result is not None:
+            result_dtype, _ = result
+            return result_dtype
+
+        obj = self.new_assignments.get(expr.name)
+
+        if obj is None:
+            obj = self.kernel.arg_dict.get(expr.name)
+
+        if obj is None:
+            obj = self.kernel.temporary_variables.get(expr.name)
+
+        if obj is None:
+            raise TypeInferenceFailure("name not known in type inference: %s"
+                    % expr.name)
+
+        from loopy.kernel.data import TemporaryVariable, KernelArgument
+        import loopy as lp
+        if isinstance(obj, TemporaryVariable):
+            result = obj.dtype
+            if result is lp.auto:
+                raise DependencyTypeInferenceFailure(
+                        "temporary variable '%s'" % expr.name,
+                        expr.name)
+            else:
+                return result
+
+        elif isinstance(obj, KernelArgument):
+            result = obj.dtype
+            if result is None:
+                raise DependencyTypeInferenceFailure(
+                        "argument '%s'" % expr.name,
+                        expr.name)
+            else:
+                return result
+
+        else:
+            raise RuntimeError("unexpected type inference "
+                    "object type for '%s'" % expr.name)
+
+    map_tagged_variable = map_variable
+
+    def map_lookup(self, expr):
+        agg_result = self.rec(expr.aggregate)
+        dtype, offset = agg_result.fields[expr.name]
+        return dtype
+
+    def map_comparison(self, expr):
+        # "bool" is unusable because OpenCL's bool has indeterminate memory
+        # format.
+        return np.dtype(np.int32)
+
+    map_logical_not = map_comparison
+    map_logical_and = map_comparison
+    map_logical_or = map_comparison
+
+    def map_reduction(self, expr):
+        return expr.operation.result_dtype(
+                self.kernel.target, self.rec(expr.expr), expr.inames)
+
+# }}}
+
+# vim: fdm=marker
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index f169eeef52ef7f029dedcf9412dfa6ef828343d6..1a610b63cb35f4931e41446f00c16a182de8430a 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -22,17 +22,49 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from loopy.diagnostic import LoopyError
+
 
 def f2loopy(source, free_form=True, strict=True,
-        pre_transform_code=None):
+        pre_transform_code=None, pre_transform_code_context=None,
+        use_c_preprocessor=False,
+        file_name="<floopy code>"):
+    if use_c_preprocessor:
+        try:
+            import ply.lex as lex
+            import ply.cpp as cpp
+        except ImportError:
+            raise LoopyError("Using the C preprocessor requires PLY to be installed")
+
+        lexer = lex.lex(cpp)
+
+        from ply.cpp import Preprocessor
+        p = Preprocessor(lexer)
+        p.parse(source, file_name)
+
+        tokens = []
+        while True:
+            tok = p.token()
+
+            if not tok:
+                break
+
+            if tok.type == "CPP_COMMENT":
+                continue
+
+            tokens.append(tok.value)
+
+        source = "".join(tokens)
+
     from fparser import api
     tree = api.parse(source, isfree=free_form, isstrict=strict,
             analyze=False, ignore_comments=False)
 
     from loopy.frontend.fortran.translator import F2LoopyTranslator
-    f2loopy = F2LoopyTranslator()
+    f2loopy = F2LoopyTranslator(file_name)
     f2loopy(tree)
 
-    return f2loopy.make_kernels(pre_transform_code=pre_transform_code)
+    return f2loopy.make_kernels(pre_transform_code=pre_transform_code,
+            pre_transform_code_context=pre_transform_code_context)
 
 # vim: foldmethod=marker
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index a6b5b422bcaa413033ecce4808be581139394876..abd925c14fe9ebf14342ac5ac25e2062291feefc 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -214,7 +214,7 @@ def remove_common_indentation(lines):
 # {{{ translator
 
 class F2LoopyTranslator(FTreeWalkerBase):
-    def __init__(self):
+    def __init__(self, filename):
         FTreeWalkerBase.__init__(self)
 
         self.scope_stack = []
@@ -234,6 +234,8 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         self.transform_code_lines = []
 
+        self.filename = filename
+
     def add_expression_instruction(self, lhs, rhs):
         scope = self.scope_stack[-1]
 
@@ -331,7 +333,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         tp = self.dtype_from_stmt(node)
 
-        for name, shape in self.parse_dimension_specs(node.entity_decls):
+        for name, shape in self.parse_dimension_specs(node, node.entity_decls):
             if shape is not None:
                 assert name not in scope.dim_map
                 scope.dim_map[name] = shape
@@ -350,7 +352,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
     def map_Dimension(self, node):
         scope = self.scope_stack[-1]
 
-        for name, shape in self.parse_dimension_specs(node.items):
+        for name, shape in self.parse_dimension_specs(node, node.items):
             if shape is not None:
                 assert name not in scope.dim_map
                 scope.dim_map[name] = shape
@@ -369,7 +371,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         for name, data in node.stmts:
             name, = name
             assert name not in scope.data
-            scope.data[name] = [self.parse_expr(i) for i in data]
+            scope.data[name] = [self.parse_expr(node, i) for i in data]
 
         return []
 
@@ -399,7 +401,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         scope = self.scope_stack[-1]
 
         lhs = scope.process_expression_for_loopy(
-                self.parse_expr(node.variable))
+                self.parse_expr(node, node.variable))
         from pymbolic.primitives import Subscript, Call
         if isinstance(lhs, Call):
             raise TranslationError("function call (to '%s') on left hand side of"
@@ -411,7 +413,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         scope.use_name(lhs_name)
 
-        rhs = scope.process_expression_for_loopy(self.parse_expr(node.expr))
+        rhs = scope.process_expression_for_loopy(self.parse_expr(node, node.expr))
 
         self.add_expression_instruction(lhs, rhs)
 
@@ -425,9 +427,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         raise NotImplementedError("save")
 
     def map_Line(self, node):
-        #from warnings import warn
-        #warn("Encountered a 'line': %s" % node)
-        raise NotImplementedError
+        pass
 
     def map_Program(self, node):
         raise NotImplementedError
@@ -467,7 +467,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         cond_var = var(cond_name)
 
         self.add_expression_instruction(
-                cond_var, self.parse_expr(node.expr))
+                cond_var, self.parse_expr(node, node.expr))
 
         self.conditions.append(cond_name)
 
@@ -489,6 +489,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
             loop_var = loop_var.strip()
             scope.use_name(loop_var)
             loop_bounds = self.parse_expr(
+                    node,
                     loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS)
 
             if len(loop_bounds) == 2:
@@ -627,12 +628,16 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
     # }}}
 
-    def make_kernels(self, pre_transform_code=None):
+    def make_kernels(self, pre_transform_code=None, pre_transform_code_context=None):
         kernel_names = [
                 sub.subprogram_name
                 for sub in self.kernels]
 
-        proc_dict = {}
+        if pre_transform_code_context is None:
+            proc_dict = {}
+        else:
+            proc_dict = pre_transform_code_context.copy()
+
         proc_dict["lp"] = lp
         proc_dict["np"] = np
 
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index 4291d98749cb84bf743c3e8c745052190000cd03..b1df6e3d01317a315354cf10a55b9312090dc61f 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -24,6 +24,8 @@ THE SOFTWARE.
 
 import re
 
+from loopy.diagnostic import LoopyError
+
 
 class FTreeWalkerBase(object):
     def __init__(self):
@@ -53,13 +55,13 @@ class FTreeWalkerBase(object):
             r"^(?P<name>[_0-9a-zA-Z]+)"
             "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
 
-    def parse_dimension_specs(self, dim_decls):
+    def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
             start_end = bounds_str.split(":")
 
             assert 1 <= len(start_end) <= 2
 
-            return [self.parse_expr(s) for s in start_end]
+            return [self.parse_expr(node, s) for s in start_end]
 
         for decl in dim_decls:
             entity_match = self.ENTITY_RE.match(decl)
@@ -81,8 +83,13 @@ class FTreeWalkerBase(object):
 
     # {{{ expressions
 
-    def parse_expr(self, expr_str, **kwargs):
-        return self.expr_parser(expr_str, **kwargs)
+    def parse_expr(self, node, expr_str, **kwargs):
+        try:
+            return self.expr_parser(expr_str, **kwargs)
+        except Exception as e:
+            raise LoopyError(
+                    "Error parsing expression '%s' on line %d of '%s': %s"
+                    % (expr_str, node.item.span[0], self.filename, str(e)))
 
     # }}}
 
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6..b876513f754209f7ff498f4b9243a9ac54cce7f8 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -137,7 +137,8 @@ def iname_rel_aff(space, iname, rel, aff):
         raise ValueError("unknown value of 'rel': %s" % rel)
 
 
-def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context):
+def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context,
+        prefer_constants):
     if context is not None:
         context = isl.align_spaces(context, pw_aff.get_domain_space(),
                 obj_bigger_ok=True).params()
@@ -163,12 +164,21 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context)
 
     # put constant bounds with unbounded validity first
     # FIXME: Heuristi-hack.
-    order = [
-            (True, False),  # constant, unbounded validity
-            (False, False),  # nonconstant, unbounded validity
-            (True, True),  # constant, bounded validity
-            (False, True),  # nonconstant, bounded validity
-            ]
+    if prefer_constants:
+        order = [
+                (True, False),  # constant, unbounded validity
+                (False, False),  # nonconstant, unbounded validity
+                (True, True),  # constant, bounded validity
+                (False, True),  # nonconstant, bounded validity
+                ]
+    else:
+        order = [
+                (False, False),  # nonconstant, unbounded validity
+                (True, False),  # constant, unbounded validity
+                (False, True),  # nonconstant, bounded validity
+                (True, True),  # constant, bounded validity
+                ]
+
     pieces = flatten([
             [(set, aff) for set, aff in pieces
                 if aff.is_cst() == want_is_constant
@@ -199,19 +209,22 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context)
             % (what, pw_aff))
 
 
-def static_min_of_pw_aff(pw_aff, constants_only, context=None):
+def static_min_of_pw_aff(pw_aff, constants_only, context=None,
+        prefer_constants=True):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.ge_set,
-            "minimum", context)
+            "minimum", context, prefer_constants)
 
 
-def static_max_of_pw_aff(pw_aff, constants_only, context=None):
+def static_max_of_pw_aff(pw_aff, constants_only, context=None,
+        prefer_constants=True):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.le_set,
-            "maximum", context)
+            "maximum", context, prefer_constants)
 
 
-def static_value_of_pw_aff(pw_aff, constants_only, context=None):
+def static_value_of_pw_aff(pw_aff, constants_only, context=None,
+        prefer_constants=True):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set,
-            "value", context)
+            "value", context, prefer_constants)
 
 
 def duplicate_axes(isl_obj, duplicate_inames, new_inames):
@@ -363,4 +376,12 @@ def simplify_via_aff(expr):
         expr))
 
 
+def project_out(set, inames):
+    for iname in inames:
+        var_dict = set.get_var_dict()
+        dt, dim_idx = var_dict[iname]
+        set = set.project_out(dt, dim_idx, 1)
+
+    return set
+
 # vim: foldmethod=marker
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e99dedbf9b4ec36dfebe082da970ff260e2fb27e..7edf43e43bceb6ee3622f3645ad37c4375a703e0 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -172,7 +172,7 @@ def parse_insn(insn):
     elif subst_match is not None:
         groups = subst_match.groupdict()
     else:
-        raise RuntimeError("isntruction parse error at '%s'" % insn)
+        raise RuntimeError("instruction parse error at '%s'" % insn)
 
     from loopy.symbolic import parse
     try:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index ab747b9a66f82044e0fe7dc4591867fdd34d3519..32c062689b47acd688933a1f615af3cd6f67135a 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -747,34 +747,6 @@ class ExpressionInstruction(InstructionBase):
 # }}}
 
 
-def _remove_common_indentation(code):
-    if "\n" not in code:
-        return code
-
-    # accommodate pyopencl-ish syntax highlighting
-    code = code.lstrip("//CL//")
-
-    if not code.startswith("\n"):
-        return code
-
-    lines = code.split("\n")
-    while lines[0].strip() == "":
-        lines.pop(0)
-    while lines[-1].strip() == "":
-        lines.pop(-1)
-
-    if lines:
-        base_indent = 0
-        while lines[0][base_indent] in " \t":
-            base_indent += 1
-
-        for line in lines[1:]:
-            if line[:base_indent].strip():
-                raise ValueError("inconsistent indentation")
-
-    return "\n".join(line[base_indent:] for line in lines)
-
-
 # {{{ c instruction
 
 class CInstruction(InstructionBase):
@@ -873,7 +845,8 @@ class CInstruction(InstructionBase):
         # }}}
 
         self.iname_exprs = new_iname_exprs
-        self.code = _remove_common_indentation(code)
+        from loopy.tools import remove_common_indentation
+        self.code = remove_common_indentation(code)
         self.read_variables = read_variables
         self.assignees = new_assignees
 
diff --git a/loopy/precompute.py b/loopy/precompute.py
index 5675537f1f3915dcce8b68b2d36200bdcaef3f5a..935d6d44040cf56b3ca3bdbbec26e22ec750a05d 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -473,7 +473,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         if precompute_inames is not None and i < len(precompute_inames):
             name = precompute_inames[i]
             tag_lookup_saxis = name
-            if not precompute_inames_already_exist and var_name_gen.is_name_conflicting(name):
+            if (not precompute_inames_already_exist
+                    and var_name_gen.is_name_conflicting(name)):
                 raise RuntimeError("new storage axis name '%s' "
                         "conflicts with existing name" % name)
 
@@ -543,8 +544,34 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                         domch.domain, non1_storage_axis_names,
                         boxify_sweep=fetch_bounding_box))
         else:
-            new_kernel_domains = domch.get_domains_with(domch.domain)
+            check_domain = domch.domain
+
+            # {{{ check the domain against the preexisting inames' domain
+
+            # inames already exist in check_domain, add them primed
+            primed_non1_saxis_names = [
+                    iname+"'" for iname in non1_storage_axis_names]
+
+            check_domain = abm.augment_domain_with_sweep(
+                check_domain, primed_non1_saxis_names,
+                boxify_sweep=fetch_bounding_box)
+
+            # project out the original copies
+            from loopy.isl_helpers import project_out
+            check_domain = project_out(check_domain, non1_storage_axis_names)
 
+            for iname in non1_storage_axis_names:
+                var_dict = check_domain.get_var_dict()
+                dt, dim_idx = var_dict[iname+"'"]
+                check_domain = check_domain.set_dim_name(dt, dim_idx, iname)
+
+            if not (check_domain <= domch.domain and domch.domain <= check_domain):
+                raise LoopyError("domain of preexisting inames does not match "
+                        "domain needed for precompute")
+
+            # }}}
+
+            new_kernel_domains = domch.get_domains_with(domch.domain)
 
     else:
         # leave kernel domains unchanged
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f9cd3f3eb2140354decf61b475ba1742d5ea6832..9de31233b831215bba72ba1e226b0df555b42286 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -162,7 +162,7 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     # }}}
 
-    from loopy.codegen.expression import TypeInferenceMapper
+    from loopy.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel,
             _DictUnionView([
                 new_temp_vars,
@@ -401,7 +401,7 @@ def realize_reduction(kernel, insn_id_filter=None):
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
 
-    from loopy.codegen.expression import TypeInferenceMapper
+    from loopy.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
     def map_reduction(expr, rec):
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8897f35938ee29e50d70edfdc52945bc2ad856ac..bad7f840f648bc72d7f505fb72a200f3df2fb0d6 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -305,7 +305,27 @@ class Reduction(AlgebraicLeaf):
     init_arg_names = ("operation", "inames", "expr")
 
     def __init__(self, operation, inames, expr):
+        if isinstance(inames, str):
+            inames = tuple(iname.strip() for iname in inames.split(","))
+
+        elif isinstance(inames, Variable):
+            inames = (inames,)
+
         assert isinstance(inames, tuple)
+
+        def strip_var(iname):
+            if isinstance(iname, Variable):
+                iname = iname.name
+
+            assert isinstance(iname, str)
+            return iname
+
+        inames = tuple(strip_var(iname) for iname in inames)
+
+        if isinstance(operation, str):
+            from loopy.library.reduction import parse_reduction_op
+            operation = parse_reduction_op(operation)
+
         from loopy.library.reduction import ReductionOperation
         assert isinstance(operation, ReductionOperation)
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..89b0d578b4871bc6d51177b6f9d6f6edd893db1a
--- /dev/null
+++ b/loopy/target/c/__init__.py
@@ -0,0 +1,114 @@
+"""OpenCL target independent of PyOpenCL."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+
+import numpy as np  # noqa
+from loopy.target import TargetBase
+
+from pytools import memoize_method
+
+
+class CTarget(TargetBase):
+    @memoize_method
+    def get_dtype_registry(self):
+        from loopy.target.c.compyte.dtypes import (
+                DTypeRegistry, fill_with_registry_with_c_types)
+        result = DTypeRegistry()
+        fill_with_registry_with_c_types(result, respect_windows=False,
+                include_bool=True)
+        return result
+
+    def is_vector_dtype(self, dtype):
+        return False
+
+    def get_vector_dtype(self, base, count):
+        raise KeyError()
+
+    def get_or_register_dtype(self, names, dtype=None):
+        return self.get_dtype_registry().get_or_register_dtype(names, dtype)
+
+    def dtype_to_typename(self, dtype):
+        return self.get_dtype_registry().dtype_to_ctype(dtype)
+
+    def get_expression_to_code_mapper(self, kernel,
+            seen_dtypes, seen_functions, allow_complex):
+        from loopy.target.c.codegen.expression import LoopyCCodeMapper
+        return (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions,
+            allow_complex=allow_complex))
+
+    # {{{ code generation
+
+    def generate_code(self, kernel, codegen_state, impl_arg_info):
+        from cgen import FunctionBody, FunctionDeclaration, Value, Module
+
+        body, implemented_domains = kernel.target.generate_body(
+                kernel, codegen_state)
+
+        mod = Module([
+            FunctionBody(
+                kernel.target.wrap_function_declaration(
+                    kernel,
+                    FunctionDeclaration(
+                        Value("void", kernel.name),
+                        [iai.cgen_declarator for iai in impl_arg_info])),
+                body)
+            ])
+
+        return str(mod), implemented_domains
+
+    def wrap_function_declaration(self, kernel, fdecl):
+        return fdecl
+
+    def generate_body(self, kernel, codegen_state):
+        from cgen import Block
+        body = Block()
+
+        # {{{ declare temporaries
+
+        body.extend(
+                idi.cgen_declarator
+                for tv in six.itervalues(kernel.temporary_variables)
+                for idi in tv.decl_info(
+                    kernel.target,
+                    is_written=True, index_dtype=kernel.index_dtype))
+
+        # }}}
+
+        from loopy.codegen.loop import set_up_hw_parallel_loops
+        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)
+
+        from cgen import Line
+        body.append(Line())
+
+        if isinstance(gen_code.ast, Block):
+            body.extend(gen_code.ast.contents)
+        else:
+            body.append(gen_code.ast)
+
+        return body, gen_code.implemented_domains
+
+    # }}}
diff --git a/loopy/target/c/codegen/__init__.py b/loopy/target/c/codegen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/loopy/codegen/expression.py b/loopy/target/c/codegen/expression.py
similarity index 77%
rename from loopy/codegen/expression.py
rename to loopy/target/c/codegen/expression.py
index f834c8fa50287c14aec1417ebef093dffd5062d9..1dc3aafcfb63789255a7f5fd79b178e657b3c74d 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -31,238 +31,11 @@ import numpy as np
 from pymbolic.mapper import RecursiveMapper
 from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT,
         PREC_POWER)
-from pymbolic.mapper import CombineMapper
 import islpy as isl
-from pytools import Record
 
-from loopy.tools import is_integer
-from loopy.diagnostic import TypeInferenceFailure, DependencyTypeInferenceFailure
-
-
-# {{{ type inference
-
-class TypeInferenceMapper(CombineMapper):
-    def __init__(self, kernel, new_assignments=None):
-        """
-        :arg new_assignments: mapping from names to either
-            :class:`loopy.kernel.data.TemporaryVariable`
-            or
-            :class:`loopy.kernel.data.KernelArgument`
-            instances
-        """
-        self.kernel = kernel
-        if new_assignments is None:
-            new_assignments = {}
-        self.new_assignments = new_assignments
-
-    # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
-    # are Python-equal (for many common constants such as integers).
-
-    @staticmethod
-    def combine(dtypes):
-        dtypes = list(dtypes)
-
-        result = dtypes.pop()
-        while dtypes:
-            other = dtypes.pop()
-
-            if result.isbuiltin and other.isbuiltin:
-                if (result, other) in [
-                        (np.int32, np.float32), (np.int32, np.float32)]:
-                    # numpy makes this a double. I disagree.
-                    result = np.dtype(np.float32)
-                else:
-                    result = (
-                            np.empty(0, dtype=result)
-                            + np.empty(0, dtype=other)
-                            ).dtype
-            elif result.isbuiltin and not other.isbuiltin:
-                # assume the non-native type takes over
-                result = other
-            elif not result.isbuiltin and other.isbuiltin:
-                # assume the non-native type takes over
-                pass
-            else:
-                if result is not other:
-                    raise TypeInferenceFailure(
-                            "nothing known about result of operation on "
-                            "'%s' and '%s'" % (result, other))
-
-        return result
-
-    def map_sum(self, expr):
-        dtypes = []
-        small_integer_dtypes = []
-        for child in expr.children:
-            dtype = self.rec(child)
-            if is_integer(child) and abs(child) < 1024:
-                small_integer_dtypes.append(dtype)
-            else:
-                dtypes.append(dtype)
-
-        from pytools import all
-        if all(dtype.kind == "i" for dtype in dtypes):
-            dtypes.extend(small_integer_dtypes)
-
-        return self.combine(dtypes)
-
-    map_product = map_sum
-
-    def map_quotient(self, expr):
-        n_dtype = self.rec(expr.numerator)
-        d_dtype = self.rec(expr.denominator)
-
-        if n_dtype.kind in "iu" and d_dtype.kind in "iu":
-            # both integers
-            return np.dtype(np.float64)
-
-        else:
-            return self.combine([n_dtype, d_dtype])
-
-    def map_constant(self, expr):
-        if is_integer(expr):
-            for tp in [np.int32, np.int64]:
-                iinfo = np.iinfo(tp)
-                if iinfo.min <= expr <= iinfo.max:
-                    return np.dtype(tp)
-
-            else:
-                raise TypeInferenceFailure("integer constant '%s' too large" % expr)
-
-        dt = np.asarray(expr).dtype
-        if hasattr(expr, "dtype"):
-            return expr.dtype
-        elif isinstance(expr, np.number):
-            # Numpy types are sized
-            return np.dtype(type(expr))
-        elif dt.kind == "f":
-            # deduce the smaller type by default
-            return np.dtype(np.float32)
-        elif dt.kind == "c":
-            if np.complex64(expr) == np.complex128(expr):
-                # (COMPLEX_GUESS_LOGIC)
-                # No precision is lost by 'guessing' single precision, use that.
-                # This at least covers simple cases like '1j'.
-                return np.dtype(np.complex64)
-
-            # Codegen for complex types depends on exactly correct types.
-            # Refuse temptation to guess.
-            raise TypeInferenceFailure("Complex constant '%s' needs to "
-                    "be sized for type inference " % expr)
-        else:
-            raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
-
-    def map_subscript(self, expr):
-        return self.rec(expr.aggregate)
-
-    def map_linear_subscript(self, expr):
-        return self.rec(expr.aggregate)
-
-    def map_call(self, expr):
-        from pymbolic.primitives import Variable
-
-        identifier = expr.function
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
-
-        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if mangle_result is not None:
-            return mangle_result[0]
+from loopy.expression import dtype_to_type_context, TypeInferenceMapper
 
-        raise RuntimeError("no type inference information on "
-                "function '%s'" % identifier)
-
-    def map_variable(self, expr):
-        if expr.name in self.kernel.all_inames():
-            return self.kernel.index_dtype
-
-        result = self.kernel.mangle_symbol(expr.name)
-        if result is not None:
-            result_dtype, _ = result
-            return result_dtype
-
-        obj = self.new_assignments.get(expr.name)
-
-        if obj is None:
-            obj = self.kernel.arg_dict.get(expr.name)
-
-        if obj is None:
-            obj = self.kernel.temporary_variables.get(expr.name)
-
-        if obj is None:
-            raise TypeInferenceFailure("name not known in type inference: %s"
-                    % expr.name)
-
-        from loopy.kernel.data import TemporaryVariable, KernelArgument
-        import loopy as lp
-        if isinstance(obj, TemporaryVariable):
-            result = obj.dtype
-            if result is lp.auto:
-                raise DependencyTypeInferenceFailure(
-                        "temporary variable '%s'" % expr.name,
-                        expr.name)
-            else:
-                return result
-
-        elif isinstance(obj, KernelArgument):
-            result = obj.dtype
-            if result is None:
-                raise DependencyTypeInferenceFailure(
-                        "argument '%s'" % expr.name,
-                        expr.name)
-            else:
-                return result
-
-        else:
-            raise RuntimeError("unexpected type inference "
-                    "object type for '%s'" % expr.name)
-
-    map_tagged_variable = map_variable
-
-    def map_lookup(self, expr):
-        agg_result = self.rec(expr.aggregate)
-        dtype, offset = agg_result.fields[expr.name]
-        return dtype
-
-    def map_comparison(self, expr):
-        # "bool" is unusable because OpenCL's bool has indeterminate memory
-        # format.
-        return np.dtype(np.int32)
-
-    map_logical_not = map_comparison
-    map_logical_and = map_comparison
-    map_logical_or = map_comparison
-
-    def map_reduction(self, expr):
-        return expr.operation.result_dtype(
-                self.kernel.target, self.rec(expr.expr), expr.inames)
-
-# }}}
-
-
-# {{{ C code mapper
-
-# type_context may be:
-# - 'i' for integer -
-# - 'f' for single-precision floating point
-# - 'd' for double-precision floating point
-# or None for 'no known context'.
-
-def dtype_to_type_context(target, dtype):
-    dtype = np.dtype(dtype)
-
-    if dtype.kind == 'i':
-        return 'i'
-    if dtype in [np.float64, np.complex128]:
-        return 'd'
-    if dtype in [np.float32, np.complex64]:
-        return 'f'
-    if target.is_vector_dtype(dtype):
-        return dtype_to_type_context(target, dtype.fields["x"][0])
-
-    return None
+from loopy.tools import is_integer
 
 
 def get_opencl_vec_member(idx):
@@ -273,25 +46,7 @@ def get_opencl_vec_member(idx):
     return "s%s" % hex(int(idx))[2:]
 
 
-class SeenFunction(Record):
-    """
-    .. attribute:: name
-    .. attribute:: c_name
-    .. attribute:: arg_dtypes
-
-        a tuple of arg dtypes
-    """
-
-    def __init__(self, name, c_name, arg_dtypes):
-        Record.__init__(self,
-                name=name,
-                c_name=c_name,
-                arg_dtypes=arg_dtypes)
-
-    def __hash__(self):
-        return hash((type(self),)
-                + tuple((f, getattr(self, f)) for f in type(self).fields))
-
+# {{{ C code mapper
 
 class LoopyCCodeMapper(RecursiveMapper):
     def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={},
@@ -554,6 +309,7 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         def seen_func(name):
             idt = self.kernel.index_dtype
+            from loopy.codegen import SeenFunction
             self.seen_functions.add(SeenFunction(name, name, (idt, idt)))
 
         if den_nonneg:
@@ -675,6 +431,7 @@ class LoopyCCodeMapper(RecursiveMapper):
                         "for function '%s' not understood"
                         % identifier)
 
+        from loopy.codegen import SeenFunction
         self.seen_functions.add(SeenFunction(identifier, c_name, par_dtypes))
         if str_parameters is None:
             # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
new file mode 160000
index 0000000000000000000000000000000000000000..fb6ba114d9d906403d47b0aaf69e2fe4cef382f2
--- /dev/null
+++ b/loopy/target/c/compyte
@@ -0,0 +1 @@
+Subproject commit fb6ba114d9d906403d47b0aaf69e2fe4cef382f2
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
index 04efdedabb6e281b99c8ffae3a5be53e084524ed..e68040c0983d5b6f144ae2b42eaa41899ef56b76 100644
--- a/loopy/target/opencl/__init__.py
+++ b/loopy/target/opencl/__init__.py
@@ -26,44 +26,21 @@ THE SOFTWARE.
 
 import numpy as np
 
-from loopy.target import TargetBase
-
-
-# {{{ type registry
-
-def _register_types():
-    from loopy.target.opencl.compyte.dtypes import (
-            _fill_dtype_registry, get_or_register_dtype)
-    import struct
-
-    _fill_dtype_registry(respect_windows=False, include_bool=False)
-
-    # complex number support left out
-
-    is_64_bit = struct.calcsize('@P') * 8 == 64
-    if not is_64_bit:
-        get_or_register_dtype(
-                ["unsigned long", "unsigned long int"], np.uint64)
-        get_or_register_dtype(
-                ["signed long", "signed long int", "long int"], np.int64)
-
-_register_types()
-
-# }}}
+from loopy.target.c import CTarget
+from pytools import memoize_method
 
 
 # {{{ vector types
 
-class vec:
+class vec:  # noqa
     pass
 
 
 def _create_vector_types():
     field_names = ["x", "y", "z", "w"]
 
-    from loopy.target.opencl.compyte.dtypes import get_or_register_dtype
-
     vec.types = {}
+    vec.names_and_dtypes = []
     vec.type_to_scalar_and_count = {}
 
     counts = [2, 3, 4, 8, 16]
@@ -109,40 +86,20 @@ def _create_vector_types():
                     dtype = np.dtype([(n, base_type) for (n, title)
                                       in zip(names, titles)])
 
-            get_or_register_dtype(name, dtype)
-
             setattr(vec, name, dtype)
 
-            def create_array(dtype, count, padded_count, *args, **kwargs):
-                if len(args) < count:
-                    from warnings import warn
-                    warn("default values for make_xxx are deprecated;"
-                            " instead specify all parameters or use"
-                            " array.vec.zeros_xxx", DeprecationWarning)
-                padded_args = tuple(list(args)+[0]*(padded_count-len(args)))
-                array = eval("array(padded_args, dtype=dtype)",
-                        dict(array=np.array, padded_args=padded_args,
-                        dtype=dtype))
-                for key, val in kwargs.items():
-                    array[key] = val
-                return array
-
-            setattr(vec, "make_"+name, staticmethod(eval(
-                    "lambda *args, **kwargs: create_array(dtype, %i, %i, "
-                    "*args, **kwargs)" % (count, padded_count),
-                    dict(create_array=create_array, dtype=dtype))))
-            setattr(vec, "filled_"+name, staticmethod(eval(
-                    "lambda val: vec.make_%s(*[val]*%i)" % (name, count))))
-            setattr(vec, "zeros_"+name,
-                    staticmethod(eval("lambda: vec.filled_%s(0)" % (name))))
-            setattr(vec, "ones_"+name,
-                    staticmethod(eval("lambda: vec.filled_%s(1)" % (name))))
+            vec.names_and_dtypes.append((name, dtype))
 
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
 _create_vector_types()
 
+
+def _register_vector_types(dtype_registry):
+    for name, dtype in vec.names_and_dtypes:
+        dtype_registry.get_or_register_dtype(name, dtype)
+
 # }}}
 
 
@@ -234,7 +191,7 @@ def opencl_preamble_generator(target, seen_dtypes, seen_functions):
 
 # {{{ target
 
-class OpenCLTarget(TargetBase):
+class OpenCLTarget(CTarget):
     def function_manglers(self):
         return (
                 super(OpenCLTarget, self).function_manglers() + [
@@ -255,13 +212,24 @@ class OpenCLTarget(TargetBase):
                     reduction_preamble_generator
                     ])
 
-    def get_or_register_dtype(self, names, dtype=None):
-        from loopy.target.opencl.compyte.dtypes import get_or_register_dtype
-        return get_or_register_dtype(names, dtype)
+    @memoize_method
+    def get_dtype_registry(self):
+        from loopy.target.c.compyte.dtypes import (
+                DTypeRegistry, fill_registry_with_c_types)
+        result = DTypeRegistry()
+        fill_registry_with_c_types(result)
+
+        # complex number support left out
 
-    def dtype_to_typename(self, dtype):
-        from loopy.target.opencl.compyte.dtypes import dtype_to_ctype
-        return dtype_to_ctype(dtype)
+        # CL defines 'long' as 64-bit
+        result.get_or_register_dtype(
+                ["unsigned long", "unsigned long int"], np.uint64)
+        result.get_or_register_dtype(
+                ["signed long", "signed long int", "long int"], np.int64)
+
+        _register_vector_types(result)
+
+        return result
 
     def is_vector_dtype(self, dtype):
         return list(vec.types.values())
@@ -269,6 +237,43 @@ class OpenCLTarget(TargetBase):
     def get_vector_dtype(self, base, count):
         return vec.types[base, count]
 
+    def wrap_function_declaration(self, kernel, fdecl):
+        from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
+        return CLRequiredWorkGroupSize(
+                kernel.get_grid_sizes_as_exprs()[1],
+                CLKernel(fdecl))
+
+    def generate_code(self, kernel, codegen_state, impl_arg_info):
+        code, implemented_domains = (
+                super(OpenCLTarget, self).generate_code(
+                    kernel, codegen_state, impl_arg_info))
+
+        from loopy.tools import remove_common_indentation
+        code = (
+                remove_common_indentation("""
+                    #define lid(N) ((%(idx_ctype)s) get_local_id(N))
+                    #define gid(N) ((%(idx_ctype)s) get_group_id(N))
+                    """ % dict(idx_ctype=self.dtype_to_typename(kernel.index_dtype)))
+                + "\n\n"
+                + code)
+
+        return code, implemented_domains
+
+    def generate_body(self, kernel, codegen_state):
+        body, implemented_domains = (
+                super(OpenCLTarget, self).generate_body(kernel, codegen_state))
+
+        from loopy.kernel.data import ImageArg
+
+        if any(isinstance(arg, ImageArg) for arg in kernel.args):
+            from cgen import Value, Const, Initializer
+            body.contents.insert(0,
+                    Initializer(Const(Value("sampler_t", "loopy_sampler")),
+                        "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
+                        "| CLK_FILTER_NEAREST"))
+
+        return body, implemented_domains
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/opencl/compyte b/loopy/target/opencl/compyte
deleted file mode 160000
index 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23..0000000000000000000000000000000000000000
--- a/loopy/target/opencl/compyte
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index a1b323d7cf0deaef308952d87b1523b0a020ad68..a0a119dce42e1095ff327f38f6b836f80091b159 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -259,13 +259,9 @@ class PyOpenCLTarget(OpenCLTarget):
     def pre_codegen_check(self, kernel):
         check_sizes(kernel, self.device)
 
-    def get_or_register_dtype(self, names, dtype=None):
-        from pyopencl.compyte.dtypes import get_or_register_dtype
-        return get_or_register_dtype(names, dtype)
-
-    def dtype_to_typename(self, dtype):
-        from pyopencl.compyte.dtypes import dtype_to_ctype
-        return dtype_to_ctype(dtype)
+    def get_dtype_registry(self):
+        from pyopencl.compyte.dtypes import TYPE_REGISTRY
+        return TYPE_REGISTRY
 
     def is_vector_dtype(self, dtype):
         from pyopencl.array import vec
diff --git a/loopy/tools.py b/loopy/tools.py
index 0c8898bb1ce20a732dbd0d64a679e18c98609edb..1f47160677c5255539a3a6c098c9ba6d8b89a18b 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -98,6 +98,8 @@ class LoopyKeyBuilder(KeyBuilderBase):
 # }}}
 
 
+# {{{ picklable dtype
+
 class PicklableDtype(object):
     """This object works around several issues with pickling :class:`numpy.dtype`
     objects. It does so by serving as a picklable wrapper around the original
@@ -157,4 +159,38 @@ class PicklableDtype(object):
     def assert_has_target(self):
         assert self.target is not None
 
+# }}}
+
+
+# {{{ remove common indentation
+
+def remove_common_indentation(code):
+    if "\n" not in code:
+        return code
+
+    # accommodate pyopencl-ish syntax highlighting
+    code = code[len("//CL//"):] if code.startswith("//CL//") else code
+
+    if not code.startswith("\n"):
+        return code
+
+    lines = code.split("\n")
+    while lines[0].strip() == "":
+        lines.pop(0)
+    while lines[-1].strip() == "":
+        lines.pop(-1)
+
+    if lines:
+        base_indent = 0
+        while lines[0][base_indent] in " \t":
+            base_indent += 1
+
+        for line in lines[1:]:
+            if line[:base_indent].strip():
+                raise ValueError("inconsistent indentation")
+
+    return "\n".join(line[base_indent:] for line in lines)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/requirements.txt b/requirements.txt
index 708b72c8d508c979f069d41b645da18b559ecfd5..b833d6dd0d989ad3b88aedb3d09cf21766536613 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,6 @@ git+git://github.com/pyopencl/pyopencl
 git+git://github.com/inducer/pymbolic
 
 hg+https://bitbucket.org/inducer/f2py
+
+# Optional, needed for using the C preprocessor on Fortran
+ply>=3.6
diff --git a/setup.cfg b/setup.cfg
index 6faef2e65abe138f9ab22c94452c43be0e52c075..d3f13a0e64b79c00a957cb1369e335e0b8a00d76 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802
 max-line-length=85
diff --git a/test/test_loopy.py b/test/test_loopy.py
index f99c0b3b3638d024530c339e4340e27d71043c4e..2173347cadaea66a4ec53d78fb8c77abb24cc7bb 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1958,6 +1958,112 @@ def test_auto_test_can_detect_problems(ctx_factory):
                 parameters=dict(n=123))
 
 
+def test_generate_c_snippet():
+    from loopy.target.c import CTarget
+
+    from pymbolic import var
+    I = var("I")  # noqa
+    f = var("f")
+    df = var("df")
+    q_v = var("q_v")
+    eN = var("eN")  # noqa
+    k = var("k")
+    u = var("u")
+
+    from functools import partial
+    l_sum = partial(lp.Reduction, "sum")
+
+    Instr = lp.ExpressionInstruction  # noqa
+
+    knl = lp.make_kernel(
+        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
+        [
+            Instr(f[I], l_sum(k, q_v[k, I]*u)),
+            Instr(df[I], l_sum(k, q_v[k, I])),
+            ],
+        [
+            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
+            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
+            lp.ValueArg("u", np.float64),
+            "...",
+            ],
+        target=CTarget(),
+        assumptions="nQuad>=1")
+
+    if 0:  # enable to play with prefetching
+        # (prefetch currently requires constant sizes)
+        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
+        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)
+
+    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
+    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")
+
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+    print(lp.generate_body(knl))
+
+
+def test_precompute_with_preexisting_inames(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
+        """
+        result[e,i] = sum(j, D1[i,j]*u[e,j])
+        result2[e,i] = sum(k, D2[i,k]*u[e,k])
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        "u": np.float32,
+        "D1": np.float32,
+        "D2": np.float32,
+        })
+
+    knl = lp.fix_parameters(knl, n=13)
+
+    ref_knl = knl
+
+    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
+    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
+
+    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
+            precompute_inames="ii,jj")
+    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
+            precompute_inames="ii,jj")
+
+    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(E=200))
+
+
+def test_precompute_with_preexisting_inames_fail():
+    knl = lp.make_kernel(
+        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
+        """
+        result[e,i] = sum(j, D1[i,j]*u[e,j])
+        result2[e,i] = sum(k, D2[i,k]*u[e,k])
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        "u": np.float32,
+        "D1": np.float32,
+        "D2": np.float32,
+        })
+
+    knl = lp.fix_parameters(knl, n=13)
+
+    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
+    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
+
+    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
+            precompute_inames="ii,jj")
+    with pytest.raises(lp.LoopyError):
+        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
+                precompute_inames="ii,jj")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])