diff --git a/.gitignore b/.gitignore
index 8c42a2c713dda5ef85e52503120dde5a04af2fa6..322e3a7caa7f229bccba899cb9bc993d95364b86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,5 @@ distribute*tar.gz
 *.log
 *profiler.conf
 core
+.coverage
+htmlcov
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..504e23cf344e2d5ae35f6f6abe97458b8c7a39b8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "loopy/target/opencl/compyte"]
+	path = loopy/target/opencl/compyte
+	url = https://github.com/inducer/compyte
diff --git a/README.rst b/README.rst
index 6390e8afa4988d7af778375c633a438372d5220e..0e551fbede0460a2e7c76167b54d672afdf81286 100644
--- a/README.rst
+++ b/README.rst
@@ -43,6 +43,11 @@ It is not (and does not want to be) a general-purpose programming language.
 
 Loopy is licensed under the liberal `MIT license
 <http://en.wikipedia.org/wiki/MIT_License>`_ and free for commercial, academic,
-and private use. To use Loopy, you just need `pyopencl
-<http://mathema.tician.de/software/pyopencl>`_.
+and private use. All of Loopy's dependencies can be automatically installed from
+the package index after using::
+
+    pip install loo.py
+
+In addition, Loopy is compatible with and enhances
+`pyopencl <http://mathema.tician.de/software/pyopencl>`_.
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index bb397db32224fc761b2b5bbeed35f9c1fbf834da..6ef10b3408e9954b16c24af88ec526e59158da0d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -37,12 +37,7 @@ from loopy.diagnostic import LoopyError
 # {{{ imported user interface
 
 from loopy.library.function import (
-        default_function_mangler, single_arg_function_mangler,
-        opencl_function_mangler)
-
-from loopy.library.preamble import default_preamble_generator
-
-from loopy.library.symbol import opencl_symbol_mangler
+        default_function_mangler, single_arg_function_mangler)
 
 from loopy.kernel.data import (
         auto,
@@ -81,8 +76,6 @@ __all__ = [
         "ExpressionInstruction", "CInstruction",
 
         "default_function_mangler", "single_arg_function_mangler",
-        "opencl_function_mangler", "opencl_symbol_mangler",
-        "default_preamble_generator",
 
         "make_kernel", "UniqueName",
 
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index cc36f3c525194fbc3418b0962835c9ca77af8929..f73183526f29eb9c559d9479e5d1bddf322e45f5 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -420,7 +420,7 @@ def auto_test_vs_ref(
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-        pp_ref_knl = lp.preprocess_kernel(ref_knl, device=dev)
+        pp_ref_knl = lp.preprocess_kernel(ref_knl)
 
         for knl in lp.generate_loop_schedules(pp_ref_knl):
             ref_sched_kernel = knl
@@ -514,7 +514,7 @@ def auto_test_vs_ref(
         if test_knl.state not in [
                 kernel_state.PREPROCESSED,
                 kernel_state.SCHEDULED]:
-            test_knl = lp.preprocess_kernel(test_knl, device=ctx.devices[0])
+            test_knl = lp.preprocess_kernel(test_knl)
 
         if not test_knl.schedule:
             test_kernels = lp.generate_loop_schedules(test_knl)
diff --git a/loopy/check.py b/loopy/check.py
index d3fecda62d69de5ba134a16556cc8cdbce5a7ccf..09ca0c576b6b3a2bca8cc19b384c8ff56e8d20c2 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -67,7 +67,7 @@ def check_insn_attributes(kernel):
 
 def check_loop_priority_inames_known(kernel):
     for iname in kernel.loop_priority:
-        if not iname in kernel.all_inames():
+        if iname not in kernel.all_inames():
             raise LoopyError("unknown iname '%s' in loop priorities" % iname)
 
 
@@ -376,54 +376,6 @@ def pre_schedule_checks(kernel):
 
 # {{{ pre-code-generation checks
 
-def check_sizes(kernel, device):
-    import loopy as lp
-
-    from loopy.diagnostic import LoopyAdvisory
-
-    parameters = {}
-    for arg in kernel.args:
-        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
-            parameters[arg.name] = arg.approximately
-
-    glens, llens = kernel.get_grid_sizes_as_exprs()
-
-    if (max(len(glens), len(llens))
-            > device.max_work_item_dimensions):
-        raise LoopyError("too many work item dimensions")
-
-    from pymbolic import evaluate
-    from pymbolic.mapper.evaluator import UnknownVariableError
-    try:
-        glens = evaluate(glens, parameters)
-        llens = evaluate(llens, parameters)
-    except UnknownVariableError as name:
-        from warnings import warn
-        warn("could not check axis bounds because no value "
-                "for variable '%s' was passed to check_kernels()"
-                % name, LoopyAdvisory)
-    else:
-        for i in range(len(llens)):
-            if llens[i] > device.max_work_item_sizes[i]:
-                raise LoopyError("group axis %d too big" % i)
-
-        from pytools import product
-        if product(llens) > device.max_work_group_size:
-            raise LoopyError("work group too big")
-
-    from pyopencl.characterize import usable_local_mem_size
-    if kernel.local_mem_use() > usable_local_mem_size(device):
-        raise LoopyError("using too much local memory")
-
-    from loopy.kernel.data import ConstantArg
-    const_arg_count = sum(
-            1 for arg in kernel.args
-            if isinstance(arg, ConstantArg))
-
-    if const_arg_count > device.max_constant_args:
-        raise LoopyError("too many constant arguments")
-
-
 def check_that_shapes_and_strides_are_arguments(kernel):
     from loopy.kernel.data import ValueArg
     from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
@@ -460,18 +412,11 @@ def check_that_shapes_and_strides_are_arguments(kernel):
                                     arg.name, ", ".join(deps-integer_arg_names)))
 
 
-def pre_codegen_checks(kernel, device=None):
+def pre_codegen_checks(kernel):
     try:
         logger.info("pre-codegen check %s: start" % kernel.name)
 
-        if device is not None:
-            check_sizes(kernel, device)
-        else:
-            from loopy.diagnostic import warn
-            warn(kernel, "no_device_in_pre_codegen_checks",
-                    "No device parameter was passed to loopy.pre_codegen_checks. "
-                    "Perhaps you want to pass a device argument to generate_code.")
-
+        kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.info("pre-codegen check %s: done" % kernel.name)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 0f645c2dde3707890fa33b06b7ac9f064d437ecb..c14c1ffa24157a51c58b31a166913e94a558e062 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -220,11 +220,12 @@ class POD(Declarator):
     and the *name* is given as a string.
     """
 
-    def __init__(self, dtype, name):
+    def __init__(self, target, dtype, name):
         dtype = np.dtype(dtype)
 
-        from pyopencl.tools import dtype_to_ctype
-        self.ctype = dtype_to_ctype(dtype)
+        self.target = target
+        self.ctype = target.dtype_to_typename(dtype)
+        self.dtype = dtype
         self.name = name
 
     def get_decl_pair(self):
@@ -233,17 +234,11 @@ class POD(Declarator):
     def struct_maker_code(self, name):
         return name
 
-    @property
-    def dtype(self):
-        from pyopencl.tools import NAME_TO_DTYPE
-        return NAME_TO_DTYPE[self.ctype]
-
     def struct_format(self):
         return self.dtype.char
 
     def alignment_requirement(self):
-        import pyopencl._pvt_struct as _struct
-        return _struct.calcsize(self.struct_format())
+        return self.target.alignment_requirement(self)
 
     def default_value(self):
         return 0
@@ -297,7 +292,7 @@ class ImplementedDataInfo(Record):
     .. attribute:: allows_offset
     """
 
-    def __init__(self, name, dtype, cgen_declarator, arg_class,
+    def __init__(self, target, name, dtype, cgen_declarator, arg_class,
             base_name=None,
             shape=None, strides=None,
             unvec_shape=None, unvec_strides=None,
@@ -308,7 +303,7 @@ class ImplementedDataInfo(Record):
 
         Record.__init__(self,
                 name=name,
-                picklable_dtype=PicklableDtype(dtype),
+                picklable_dtype=PicklableDtype(dtype, target=target),
                 cgen_declarator=cgen_declarator,
                 arg_class=arg_class,
                 base_name=base_name,
@@ -338,6 +333,11 @@ code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
 # {{{ main code generation entrypoint
 
 def generate_code(kernel, device=None):
+    if device is not None:
+        from warnings import warn
+        warn("passing 'device' to generate_code() is deprecated",
+                DeprecationWarning, stacklevel=2)
+
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
         kernel = get_one_scheduled_kernel(kernel)
@@ -351,14 +351,9 @@ def generate_code(kernel, device=None):
     from loopy import CACHING_ENABLED
 
     if CACHING_ENABLED:
-        if device is not None:
-            device_id = device.persistent_unique_id
-        else:
-            device_id = None
-
-        code_gen_cache_key = (kernel, device_id)
+        input_kernel = kernel
         try:
-            result = code_gen_cache[code_gen_cache_key]
+            result = code_gen_cache[input_kernel]
             logger.info("%s: code generation cache hit" % kernel.name)
             return result
         except KeyError:
@@ -370,7 +365,7 @@ def generate_code(kernel, device=None):
     kernel = infer_unknown_types(kernel, expect_completion=True)
 
     from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel, device=device)
+    pre_codegen_checks(kernel)
 
     from cgen import (FunctionBody, FunctionDeclaration,
             Value, Module, Block,
@@ -407,14 +402,16 @@ def generate_code(kernel, device=None):
         if isinstance(arg, ArrayBase):
             impl_arg_info.extend(
                     arg.decl_info(
+                        kernel.target,
                         is_written=arg.name in kernel.get_written_variables(),
                         index_dtype=kernel.index_dtype))
 
         elif isinstance(arg, ValueArg):
             impl_arg_info.append(ImplementedDataInfo(
+                target=kernel.target,
                 name=arg.name,
                 dtype=arg.dtype,
-                cgen_declarator=Const(POD(arg.dtype, arg.name)),
+                cgen_declarator=Const(POD(kernel.target, arg.dtype, arg.name)),
                 arg_class=ValueArg))
 
         else:
@@ -427,12 +424,11 @@ def generate_code(kernel, device=None):
 
     # }}}
 
-    from pyopencl.tools import dtype_to_ctype
     mod.extend([
         LiteralLines(r"""
         #define lid(N) ((%(idx_ctype)s) get_local_id(N))
         #define gid(N) ((%(idx_ctype)s) get_group_id(N))
-        """ % dict(idx_ctype=dtype_to_ctype(kernel.index_dtype))),
+        """ % dict(idx_ctype=kernel.target.dtype_to_typename(kernel.index_dtype))),
         Line()])
 
     # {{{ build lmem array declarators for temporary variables
@@ -441,6 +437,7 @@ def generate_code(kernel, device=None):
             idi.cgen_declarator
             for tv in six.itervalues(kernel.temporary_variables)
             for idi in tv.decl_info(
+                kernel.target,
                 is_written=True, index_dtype=kernel.index_dtype))
 
     # }}}
@@ -478,8 +475,11 @@ def generate_code(kernel, device=None):
         seen_dtypes.add(tv.dtype)
 
     preambles = kernel.preambles[:]
-    for prea_gen in kernel.preamble_generators:
-        preambles.extend(prea_gen(seen_dtypes, seen_functions))
+
+    preamble_generators = (kernel.preamble_generators
+            + kernel.target.preamble_generators())
+    for prea_gen in preamble_generators:
+        preambles.extend(prea_gen(kernel.target, seen_dtypes, seen_functions))
 
     seen_preamble_tags = set()
     dedup_preambles = []
@@ -506,7 +506,9 @@ def generate_code(kernel, device=None):
 
     result = result, impl_arg_info
 
-    code_gen_cache[code_gen_cache_key] = result
+    if CACHING_ENABLED:
+        code_gen_cache[input_kernel] = result
+
     return result
 
 # }}}
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index 1963c299571206a20ea0c48ab49b2923968e31bb..bbba284c5d1329ffbd951840fac060b53832e0dd 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -33,8 +33,6 @@ from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT,
         PREC_POWER)
 from pymbolic.mapper import CombineMapper
 import islpy as isl
-import pyopencl as cl
-import pyopencl.array  # noqa
 from pytools import Record
 
 from loopy.tools import is_integer
@@ -179,11 +177,10 @@ class TypeInferenceMapper(CombineMapper):
         if expr.name in self.kernel.all_inames():
             return self.kernel.index_dtype
 
-        for mangler in self.kernel.symbol_manglers:
-            result = mangler(expr.name)
-            if result is not None:
-                result_dtype, _ = result
-                return result_dtype
+        result = self.kernel.mangle_symbol(expr.name)
+        if result is not None:
+            result_dtype, _ = result
+            return result_dtype
 
         obj = self.new_assignments.get(expr.name)
 
@@ -238,7 +235,8 @@ class TypeInferenceMapper(CombineMapper):
     map_logical_or = map_comparison
 
     def map_reduction(self, expr):
-        return expr.operation.result_dtype(self.rec(expr.expr), expr.inames)
+        return expr.operation.result_dtype(
+                self.kernel.target, self.rec(expr.expr), expr.inames)
 
 # }}}
 
@@ -251,7 +249,7 @@ class TypeInferenceMapper(CombineMapper):
 # - 'd' for double-precision floating point
 # or None for 'no known context'.
 
-def dtype_to_type_context(dtype):
+def dtype_to_type_context(target, dtype):
     dtype = np.dtype(dtype)
 
     if dtype.kind == 'i':
@@ -260,8 +258,8 @@ def dtype_to_type_context(dtype):
         return 'd'
     if dtype in [np.float32, np.complex64]:
         return 'f'
-    if dtype in list(cl.array.vec.types.values()):
-        return dtype_to_type_context(dtype.fields["x"][0])
+    if target.is_vector_dtype(dtype):
+        return dtype_to_type_context(target, dtype.fields["x"][0])
 
     return None
 
@@ -402,11 +400,10 @@ class LoopyCCodeMapper(RecursiveMapper):
                     raise RuntimeError("unsubscripted reference to array '%s'"
                             % expr.name)
 
-        for mangler in self.kernel.symbol_manglers:
-            result = mangler(expr.name)
-            if result is not None:
-                _, c_name = result
-                return c_name
+        result = self.kernel.mangle_symbol(expr.name)
+        if result is not None:
+            _, c_name = result
+            return c_name
 
         return expr.name
 
@@ -447,7 +444,7 @@ class LoopyCCodeMapper(RecursiveMapper):
         from loopy.kernel.array import get_access_info
         from pymbolic import evaluate
 
-        access_info = get_access_info(ary, expr.index,
+        access_info = get_access_info(self.kernel.target, ary, expr.index,
                 lambda expr: evaluate(expr, self.var_subst_map))
 
         vec_member = get_opencl_vec_member(access_info.vector_index)
@@ -462,7 +459,7 @@ class LoopyCCodeMapper(RecursiveMapper):
 
             if ary.dtype == np.float32:
                 return base_access+".x"
-            if ary.dtype in cl.array.vec.type_to_scalar_and_count:
+            if self.kernel.target.is_vector_dtype(ary.dtype):
                 return base_access
             elif ary.dtype == np.float64:
                 return "as_double(%s.xy)" % base_access
@@ -667,7 +664,8 @@ class LoopyCCodeMapper(RecursiveMapper):
                 result_dtype, c_name, arg_tgt_dtypes = mangle_result
 
                 str_parameters = [
-                        self.rec(par, PREC_NONE, dtype_to_type_context(tgt_dtype),
+                        self.rec(par, PREC_NONE,
+                            dtype_to_type_context(self.kernel.target, tgt_dtype),
                             tgt_dtype)
                         for par, par_dtype, tgt_dtype in zip(
                             expr.parameters, par_dtypes, arg_tgt_dtypes)]
@@ -683,7 +681,8 @@ class LoopyCCodeMapper(RecursiveMapper):
             # not. Using the inferred type as a stopgap for now.
             str_parameters = [
                     self.rec(par, PREC_NONE,
-                        type_context=dtype_to_type_context(par_dtype))
+                        type_context=dtype_to_type_context(
+                            self.kernel.target, par_dtype))
                     for par, par_dtype in zip(expr.parameters, par_dtypes)]
 
         if c_name is None:
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index cc102f0c26834eceee2449cd74e52b94bc19a848..ca1cce69c4cf1f87233c35e3dfc7987940f132fa 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -99,7 +99,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     result = Assign(
             lhs_code,
             ccm(expr, prec=PREC_NONE,
-                type_context=dtype_to_type_context(target_dtype),
+                type_context=dtype_to_type_context(kernel.target, target_dtype),
                 needed_dtype=target_dtype))
 
     if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
@@ -174,7 +174,7 @@ def generate_c_instruction_code(kernel, insn, codegen_state):
 
         body.append(
                 Initializer(
-                    POD(kernel.index_dtype, name),
+                    POD(kernel.target, kernel.index_dtype, name),
                     codegen_state.c_code_mapper(
                         iname_expr, prec=PREC_NONE, type_context="i")))
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index f5e9c9f7fc87e24b47e193e6740a928f695cd3e2..40f433a90f90855aa554261b8d6fb5cbdd5fb0ec 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -369,11 +369,10 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
 
         else:
             from loopy.codegen import wrap_in
-            from pyopencl.tools import dtype_to_ctype
 
             result.append(wrap_in(For,
                     "%s %s = %s"
-                    % (dtype_to_ctype(kernel.index_dtype),
+                    % (kernel.target.dtype_to_typename(kernel.index_dtype),
                         loop_iname, ccm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                     "%s <= %s" % (
                         loop_iname, ccm(aff_to_expr(static_ubound), PREC_NONE, "i")),
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 14a2ed46a0bb93eeb4c7341702b7ab07ddb52208..7fb686e742c914e87ce07f3a9043308d7ff61c9f 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -384,7 +384,9 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
                         "be supplied\")" % arg.name)
                 gen("")
 
-        if is_written and arg.arg_class is lp.ImageArg and not options.skip_arg_checks:
+        if (is_written
+                and arg.arg_class is lp.ImageArg
+                and not options.skip_arg_checks):
             gen("if %s is None:" % arg.name)
             with Indentation(gen):
                 gen("raise RuntimeError(\"written image '%s' must "
@@ -683,7 +685,7 @@ class CompiledKernel:
 
         if kernel.schedule is None:
             from loopy.preprocess import preprocess_kernel
-            kernel = preprocess_kernel(kernel, self.context.devices[0])
+            kernel = preprocess_kernel(kernel)
 
             from loopy.schedule import get_one_scheduled_kernel
             kernel = get_one_scheduled_kernel(kernel)
@@ -695,7 +697,7 @@ class CompiledKernel:
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code
-        code, impl_arg_info = generate_code(kernel, device=self.context.devices[0])
+        code, impl_arg_info = generate_code(kernel)
 
         if self.kernel.options.write_cl:
             output = code
@@ -733,7 +735,7 @@ class CompiledKernel:
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
 
         from loopy.codegen import generate_code
-        code, arg_info = generate_code(kernel, device=self.context.devices[0])
+        code, arg_info = generate_code(kernel)
         return code
 
     def get_highlighted_code(self, arg_to_dtype=None):
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 754944cefb68a4f7bdbc2f2aa9f7de29268b66cd..f048f06b8c1087dc45f462b69bcf20596cdd7ee5 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -32,62 +32,6 @@ import islpy as isl
 from islpy import dim_type
 
 
-def block_shift_constraint(cns, type, pos, multiple, as_equality=None):
-    if as_equality != cns.is_equality():
-        if as_equality:
-            factory = isl.Constraint.equality_from_aff
-        else:
-            factory = isl.Constraint.inequality_from_aff
-
-        cns = factory(cns.get_aff())
-
-    cns = cns.set_constant(cns.get_constant()
-            + cns.get_coefficient(type, pos)*multiple)
-
-    return cns
-
-
-def negate_constraint(cns):
-    assert not cns.is_equality()
-    # FIXME hackety hack
-    my_set = (isl.BasicSet.universe(cns.get_space())
-            .add_constraint(cns))
-    my_set = my_set.complement()
-
-    results = []
-
-    def examine_basic_set(s):
-        s.foreach_constraint(results.append)
-
-    my_set.foreach_basic_set(examine_basic_set)
-    result, = results
-    return result
-
-
-def make_index_map(set, index_expr):
-    from loopy.symbolic import eq_constraint_from_expr
-
-    if not isinstance(index_expr, tuple):
-        index_expr = (index_expr,)
-
-    amap = isl.Map.from_domain(set).add_dims(dim_type.out, len(index_expr))
-    out_names = ["_ary_idx_%d" % i for i in range(len(index_expr))]
-
-    dim = amap.get_space()
-    all_constraints = tuple(
-            eq_constraint_from_expr(dim, iexpr_i)
-            for iexpr_i in index_expr)
-
-    for i, out_name in enumerate(out_names):
-        amap = amap.set_dim_name(dim_type.out, i, out_name)
-
-    for i, (out_name, constr) in enumerate(zip(out_names, all_constraints)):
-        constr.set_coefficients_by_name({out_name: -1})
-        amap = amap.add_constraint(constr)
-
-    return amap
-
-
 def pw_aff_to_aff(pw_aff):
     if isinstance(pw_aff, isl.Aff):
         return pw_aff
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 7dd67bc42077088f31bd219cd179709fce959661..db1af79ad110c9e195a3c82acc77ba12f82fe3c2 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -39,12 +39,8 @@ from pytools import UniqueNameGenerator, generate_unique_names
 
 from loopy.library.function import (
         default_function_mangler,
-        opencl_function_mangler,
         single_arg_function_mangler)
 
-from loopy.library.symbol import opencl_symbol_mangler
-from loopy.library.preamble import default_preamble_generator
-
 from loopy.diagnostic import CannotBranchDomainTree
 
 
@@ -146,6 +142,10 @@ class LoopKernel(RecordWithoutPickling):
     .. attribute:: state
 
         A value from :class:`kernel_state`.
+
+    .. attribute:: target
+
+        A subclass of :class:`loopy.target.TargetBase`.
     """
 
     # {{{ constructor
@@ -153,7 +153,7 @@ class LoopKernel(RecordWithoutPickling):
     def __init__(self, domains, instructions, args=[], schedule=None,
             name="loopy_kernel",
             preambles=[],
-            preamble_generators=[default_preamble_generator],
+            preamble_generators=[],
             assumptions=None,
             local_sizes={},
             temporary_variables={},
@@ -161,10 +161,9 @@ class LoopKernel(RecordWithoutPickling):
             substitutions={},
             function_manglers=[
                 default_function_mangler,
-                opencl_function_mangler,
                 single_arg_function_mangler,
                 ],
-            symbol_manglers=[opencl_symbol_mangler],
+            symbol_manglers=[],
 
             iname_slab_increments={},
             loop_priority=[],
@@ -176,6 +175,7 @@ class LoopKernel(RecordWithoutPickling):
             options=None,
 
             state=kernel_state.INITIAL,
+            target=None,
 
             # When kernels get intersected in slab decomposition,
             # their grid sizes shouldn't change. This provides
@@ -278,15 +278,18 @@ class LoopKernel(RecordWithoutPickling):
                 symbol_manglers=symbol_manglers,
                 index_dtype=index_dtype,
                 options=options,
-                state=state)
+                state=state,
+                target=target)
 
     # }}}
 
     # {{{ function mangling
 
     def mangle_function(self, identifier, arg_dtypes):
-        for mangler in self.function_manglers:
-            mangle_result = mangler(identifier, arg_dtypes)
+        manglers = self.target.function_manglers() + self.function_manglers
+
+        for mangler in manglers:
+            mangle_result = mangler(self.target, identifier, arg_dtypes)
             if mangle_result is not None:
                 return mangle_result
 
@@ -294,6 +297,20 @@ class LoopKernel(RecordWithoutPickling):
 
     # }}}
 
+    # {{{ symbol mangling
+
+    def mangle_symbol(self, identifier):
+        manglers = self.target.symbol_manglers() + self.symbol_manglers
+
+        for mangler in manglers:
+            result = mangler(self.target, identifier)
+            if result is not None:
+                return result
+
+        return None
+
+    # }}}
+
     # {{{ name wrangling
 
     @memoize_method
@@ -1085,6 +1102,7 @@ class LoopKernel(RecordWithoutPickling):
             "silenced_warnings",
             "options",
             "state",
+            "target",
             ]
 
     comparison_fields = hash_fields + [
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index a14451ba1d6f4f6843c006ff9ab982ec7ebb302e..5503437555f3f81e2856cf512238d7c59589a29f 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -31,9 +31,6 @@ from six import iteritems
 
 from pytools import Record, memoize_method
 
-import pyopencl as cl  # noqa
-import pyopencl.array  # noqa
-
 import numpy as np  # noqa
 
 from loopy.diagnostic import LoopyError
@@ -244,6 +241,8 @@ def _parse_array_dim_tag(tag, default_target_axis, nesting_levels):
         padded_stride_match = PADDED_STRIDE_TAG_RE.match(tag)
         if padded_stride_match is not None:
             tag = padded_stride_match.group(1)
+
+            from loopy.symbolic import parse
             pad_to = parse(padded_stride_match.group(2))
         else:
             pad_to = None
@@ -592,7 +591,10 @@ class ArrayBase(Record):
 
         if dtype is not None and dtype is not lp.auto:
             from loopy.tools import PicklableDtype
-            picklable_dtype = PicklableDtype(dtype)
+            if not isinstance(dtype, PicklableDtype):
+                picklable_dtype = PicklableDtype(dtype)
+            else:
+                picklable_dtype = dtype
 
             if picklable_dtype.dtype == object:
                 raise TypeError("loopy does not directly support object arrays "
@@ -826,7 +828,7 @@ class ArrayBase(Record):
 
         return self.copy(**kwargs)
 
-    def vector_size(self):
+    def vector_size(self, target):
         """Return the size of the vector type used for the array
         divided by the basic data type.
 
@@ -844,13 +846,13 @@ class ArrayBase(Record):
                             "length for vector axis %d (0-based)" % (
                                 self.name, i))
 
-                vec_dtype = cl.array.vec.types[self.dtype, shape_i]
+                vec_dtype = target.vector_dtype(self.dtype, shape_i)
 
                 return int(vec_dtype.itemsize) // int(self.dtype.itemsize)
 
         return 1
 
-    def decl_info(self, is_written, index_dtype):
+    def decl_info(self, target, is_written, index_dtype):
         """Return a list of :class:`loopy.codegen.ImplementedDataInfo`
         instances corresponding to the argume
         """
@@ -907,6 +909,7 @@ class ArrayBase(Record):
 
                     stride_args.append(
                             ImplementedDataInfo(
+                                target=target,
                                 name=stride_name,
                                 dtype=index_dtype,
                                 cgen_declarator=Const(POD(index_dtype, stride_name)),
@@ -915,12 +918,13 @@ class ArrayBase(Record):
                                     full_name, stride_impl_axis)))
 
                 yield ImplementedDataInfo(
+                            target=target,
                             name=full_name,
                             base_name=self.name,
 
                             # implemented by various argument types
                             cgen_declarator=self.get_arg_decl(
-                                name_suffix, shape, dtype, is_written),
+                                target, name_suffix, shape, dtype, is_written),
 
                             arg_class=type(self),
                             dtype=dtype,
@@ -935,6 +939,7 @@ class ArrayBase(Record):
                     from cgen import Const, POD
                     offset_name = full_name+"_offset"
                     yield ImplementedDataInfo(
+                                target=target,
                                 name=offset_name,
                                 dtype=index_dtype,
                                 cgen_declarator=Const(POD(index_dtype, offset_name)),
@@ -1003,7 +1008,7 @@ class ArrayBase(Record):
                         # vectors always have stride 1
                         unvec_strides + (1,),
                         stride_arg_axes,
-                        cl.array.vec.types[dtype, shape_i],
+                        target.vector_dtype(dtype, shape_i),
                         user_index + (None,)):
                     yield res
 
@@ -1067,7 +1072,7 @@ class AccessInfo(Record):
     """
 
 
-def get_access_info(ary, index, eval_expr):
+def get_access_info(target, ary, index, eval_expr):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
@@ -1109,7 +1114,7 @@ def get_access_info(ary, index, eval_expr):
     vector_index = None
     subscripts = [0] * num_target_axes
 
-    vector_size = ary.vector_size()
+    vector_size = ary.vector_size(target)
 
     # {{{ process separate-array dim tags first, to find array name
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 0727000f9d1b5cda84359c0d7ac3e49e5694f028..426fd3c537ee7635d661980bd3655031387b3568 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1043,6 +1043,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         to silence
     :arg options: an instance of :class:`loopy.Options` or an equivalent
         string representation
+    :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*,
+        to use an OpenCL target.
     """
 
     defines = kwargs.pop("defines", {})
@@ -1051,6 +1053,17 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     silenced_warnings = kwargs.pop("silenced_warnings", [])
     options = kwargs.pop("options", None)
     flags = kwargs.pop("flags", None)
+    target = kwargs.pop("target", None)
+
+    if target is None:
+        try:
+            import pyopencl  # noqa
+        except ImportError:
+            from loopy.target.opencl import OpenCLTarget
+            target = OpenCLTarget()
+        else:
+            from loopy.target.pyopencl import PyOpenCLTarget
+            target = PyOpenCLTarget()
 
     if flags is not None:
         if options is not None:
@@ -1153,6 +1166,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
             temporary_variables=temporary_variables,
             silenced_warnings=silenced_warnings,
             options=options,
+            target=target,
             **kwargs)
 
     from loopy import duplicate_inames
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index a563b9ef39560dacfa77943e037d4a25b2213c15..eae4008b8b515ad4e82c3a140f53866d4c8de50e 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -175,10 +175,9 @@ def parse_tag(tag):
 
 # {{{ arguments
 
-
 class KernelArgument(Record):
     def __init__(self, **kwargs):
-        dtype = kwargs.pop("dtype")
+        dtype = kwargs.pop("dtype", None)
 
         if isinstance(dtype, np.dtype):
             from loopy.tools import PicklableDtype
@@ -210,13 +209,13 @@ class GlobalArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
         from cgen.opencl import CLGlobal
 
         arg_decl = RestrictPointer(
-                POD(dtype, self.name + name_suffix))
+                POD(target, dtype, self.name + name_suffix))
 
         if not is_written:
             arg_decl = Const(arg_decl)
@@ -228,7 +227,7 @@ class ConstantArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
-    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
         from cgen.opencl import CLConstant
@@ -250,7 +249,7 @@ class ImageArg(ArrayBase, KernelArgument):
     def dimensions(self):
         return len(self.dim_tags)
 
-    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
         if is_written:
             mode = "w"
         else:
@@ -262,7 +261,8 @@ class ImageArg(ArrayBase, KernelArgument):
 
 class ValueArg(KernelArgument):
     def __init__(self, name, dtype=None, approximately=1000):
-        if dtype is not None:
+        from loopy.tools import PicklableDtype
+        if dtype is not None and not isinstance(dtype, PicklableDtype):
             dtype = np.dtype(dtype)
 
         KernelArgument.__init__(self, name=name, dtype=dtype,
@@ -342,12 +342,12 @@ class TemporaryVariable(ArrayBase):
         from pytools import product
         return product(si for si in self.shape)*self.dtype.itemsize
 
-    def get_arg_decl(self, name_suffix, shape, dtype, is_written):
+    def get_arg_decl(self, target, name_suffix, shape, dtype, is_written):
         from cgen import ArrayOf
         from loopy.codegen import POD  # uses the correct complex type
         from cgen.opencl import CLLocal
 
-        temp_var_decl = POD(self.dtype, self.name)
+        temp_var_decl = POD(target, self.dtype, self.name)
 
         # FIXME take into account storage_shape, or something like it
         storage_shape = self.shape
diff --git a/loopy/library/function.py b/loopy/library/function.py
index fcae07b6b599761e4081c40f1eaad54ca9bcb2c6..e494169bbe5b83df852ea3d483ed3640381891f6 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -23,64 +23,19 @@ THE SOFTWARE.
 """
 
 
-import numpy as np
-
-
-def default_function_mangler(name, arg_dtypes):
+def default_function_mangler(target, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
 
     manglers = [reduction_function_mangler]
     for mangler in manglers:
-        result = mangler(name, arg_dtypes)
+        result = mangler(target, name, arg_dtypes)
         if result is not None:
             return result
 
     return None
 
 
-def opencl_function_mangler(name, arg_dtypes):
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type([], arg_dtypes)
-
-        if dtype.kind == "c":
-            raise RuntimeError("min/max do not support complex numbers")
-
-        if dtype.kind == "f":
-            name = "f" + name
-
-        return dtype, name
-
-    if name in "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
-
-    if len(arg_dtypes) == 1:
-        arg_dtype, = arg_dtypes
-
-        if arg_dtype.kind == "c":
-            if arg_dtype == np.complex64:
-                tpname = "cfloat"
-            elif arg_dtype == np.complex128:
-                tpname = "cdouble"
-            else:
-                raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
-
-            if name in ["sqrt", "exp", "log",
-                    "sin", "cos", "tan",
-                    "sinh", "cosh", "tanh",
-                    "conj"]:
-                return arg_dtype, "%s_%s" % (tpname, name)
-
-            if name in ["real", "imag", "abs"]:
-                return np.dtype(arg_dtype.type(0).real), "%s_%s" % (tpname, name)
-
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].fields["s0"]
-        return scalar_dtype, name
-
-    return None
-
-
-def single_arg_function_mangler(name, arg_dtypes):
+def single_arg_function_mangler(target, name, arg_dtypes):
     if len(arg_dtypes) == 1:
         dtype, = arg_dtypes
         return dtype, name
diff --git a/loopy/library/preamble.py b/loopy/library/preamble.py
deleted file mode 100644
index 86c1fecec1019e9b0b5938464c27f3c07932211c..0000000000000000000000000000000000000000
--- a/loopy/library/preamble.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-import numpy as np
-
-
-def default_preamble_generator(seen_dtypes, seen_functions):
-    from loopy.library.reduction import reduction_preamble_generator
-
-    for result in reduction_preamble_generator(seen_dtypes, seen_functions):
-        yield result
-
-    has_double = False
-    has_complex = False
-
-    for dtype in seen_dtypes:
-        if dtype in [np.float64, np.complex128]:
-            has_double = True
-        if dtype.kind == "c":
-            has_complex = True
-
-    if has_double:
-        yield ("00_enable_double", """
-            #if __OPENCL_C_VERSION__ < 120
-            #pragma OPENCL EXTENSION cl_khr_fp64: enable
-            #endif
-            """)
-
-    if has_complex:
-        if has_double:
-            yield ("10_include_complex_header", """
-                #define PYOPENCL_DEFINE_CDOUBLE
-
-                #include <pyopencl-complex.h>
-                """)
-        else:
-            yield ("10_include_complex_header", """
-                #include <pyopencl-complex.h>
-                """)
-
-    c_funcs = set(func.c_name for func in seen_functions)
-    if "int_floor_div" in c_funcs:
-        yield ("05_int_floor_div", """
-            #define int_floor_div(a,b) \
-              (( (a) - \
-                 ( ( (a)<0 ) != ( (b)<0 )) \
-                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
-               / (b) )
-            """)
-
-    if "int_floor_div_pos_b" in c_funcs:
-        yield ("05_int_floor_div_pos_b", """
-            #define int_floor_div_pos_b(a,b) ( \
-                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
-                )
-            """)
-
-# vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 650d1e45218ff21a8140288cb3e84762b98046ef..9a4330658515c0eccfc6566d1a478adb6afbecf2 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -27,6 +27,7 @@ from pymbolic import var
 import numpy as np
 
 from loopy.symbolic import FunctionIdentifier
+from loopy.diagnostic import LoopyError
 
 
 class ReductionOperation(object):
@@ -34,7 +35,7 @@ class ReductionOperation(object):
     equality-comparable.
     """
 
-    def result_dtype(self, arg_dtype, inames):
+    def result_dtype(self, target, arg_dtype, inames):
         raise NotImplementedError
 
     def neutral_element(self, dtype, inames):
@@ -54,32 +55,50 @@ class ReductionOperation(object):
     def __ne__(self, other):
         return not self.__eq__(other)
 
+    @staticmethod
+    def parse_result_type(target, op_type):
+        try:
+            return np.dtype(op_type)
+        except TypeError:
+            pass
+
+        if op_type.startswith("vec_"):
+            try:
+                return target.get_or_register_dtype(op_type[4:])
+            except AttributeError:
+                pass
+
+        raise LoopyError("unable to parse reduction type: '%s'"
+                % op_type)
+
 
 class ScalarReductionOperation(ReductionOperation):
-    def __init__(self, forced_result_dtype=None):
+    def __init__(self, forced_result_type=None):
         """
-        :arg forced_result_dtype: Force the reduction result to be of this type.
+        :arg forced_result_type: Force the reduction result to be of this type.
+            May be a string identifying the type for the backend under
+            consideration.
         """
-        self.forced_result_dtype = forced_result_dtype
+        self.forced_result_type = forced_result_type
 
-    def result_dtype(self, arg_dtype, inames):
-        if self.forced_result_dtype is not None:
-            return self.forced_result_dtype
+    def result_dtype(self, target, arg_dtype, inames):
+        if self.forced_result_type is not None:
+            return self.parse_result_type(target, self.forced_result_type)
 
         return arg_dtype
 
     def __hash__(self):
-        return hash((type(self), self.forced_result_dtype))
+        return hash((type(self), self.forced_result_type))
 
     def __eq__(self, other):
         return (type(self) == type(other)
-                and self.forced_result_dtype == other.forced_result_dtype)
+                and self.forced_result_type == other.forced_result_type)
 
     def __str__(self):
         result = type(self).__name__.replace("ReductionOperation", "").lower()
 
-        if self.forced_result_dtype is not None:
-            result = "%s<%s>" % (result, str(self.forced_result_dtype))
+        if self.forced_result_type is not None:
+            result = "%s<%s>" % (result, str(self.forced_result_type))
 
         return result
 
@@ -136,15 +155,14 @@ class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, dtype):
         return "loopy_arg%s_%s" % (self.which, dtype.type.__name__)
 
-    def result_dtype(self, dtype, inames):
+    def result_dtype(self, target, dtype, inames):
         try:
             return ARGEXT_STRUCT_DTYPES[dtype]
         except KeyError:
             struct_dtype = np.dtype([("value", dtype), ("index", np.int32)])
             ARGEXT_STRUCT_DTYPES[dtype] = struct_dtype
 
-            from pyopencl.tools import get_or_register_dtype
-            get_or_register_dtype(self.prefix(dtype)+"_result", struct_dtype)
+            target.get_or_register_dtype(self.prefix(dtype)+"_result", struct_dtype)
             return struct_dtype
 
     def neutral_element(self, dtype, inames):
@@ -188,11 +206,10 @@ class ArgExtFunction(FunctionIdentifier):
         return (self.reduction_op, self.scalar_dtype, self.name, self.inames)
 
 
-def get_argext_preamble(func_id):
+def get_argext_preamble(target, func_id):
     op = func_id.reduction_op
     prefix = op.prefix(func_id.scalar_dtype)
 
-    from pyopencl.tools import dtype_to_ctype
     from pymbolic.mapper.c_code import CCodeMapper
 
     c_code_mapper = CCodeMapper()
@@ -225,7 +242,7 @@ def get_argext_preamble(func_id):
     }
     """ % dict(
             type_name=prefix+"_result",
-            scalar_type=dtype_to_ctype(func_id.scalar_dtype),
+            scalar_type=target.dtype_to_typename(func_id.scalar_dtype),
             prefix=prefix,
             neutral=c_code_mapper(
                 op.neutral_sign*get_le_neutral(func_id.scalar_dtype)),
@@ -267,20 +284,8 @@ def parse_reduction_op(name):
         op_name = red_op_match.group(1)
         op_type = red_op_match.group(2)
 
-        try:
-            op_dtype = np.dtype(op_type)
-        except TypeError:
-            op_dtype = None
-
-        if op_dtype is None and op_type.startswith("vec_"):
-            import pyopencl.array as cl_array
-            try:
-                op_dtype = getattr(cl_array.vec, op_type[4:])
-            except AttributeError:
-                op_dtype = None
-
-        if op_name in _REDUCTION_OPS and op_dtype is not None:
-            return _REDUCTION_OPS[op_name](op_dtype)
+        if op_name in _REDUCTION_OPS:
+            return _REDUCTION_OPS[op_name](op_type)
 
     if name in _REDUCTION_OPS:
         return _REDUCTION_OPS[name]()
@@ -295,18 +300,27 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(func_id, arg_dtypes):
+def reduction_function_mangler(target, func_id, arg_dtypes):
     if isinstance(func_id, ArgExtFunction):
+        from loopy.target.opencl import OpenCLTarget
+        if not isinstance(target, OpenCLTarget):
+            raise LoopyError("only OpenCL supported for now")
+
         op = func_id.reduction_op
-        return (op.result_dtype(func_id.scalar_dtype, func_id.inames),
+        return (op.result_dtype(target, func_id.scalar_dtype, func_id.inames),
                 "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name))
 
     return None
 
 
-def reduction_preamble_generator(seen_dtypes, seen_functions):
+def reduction_preamble_generator(target, seen_dtypes, seen_functions):
+    from loopy.target.opencl import OpenCLTarget
+
     for func in seen_functions:
         if isinstance(func.name, ArgExtFunction):
-            yield get_argext_preamble(func.name)
+            if not isinstance(target, OpenCLTarget):
+                raise LoopyError("only OpenCL supported for now")
+
+            yield get_argext_preamble(target, func.name)
 
 # vim: fdm=marker
diff --git a/loopy/library/symbol.py b/loopy/library/symbol.py
deleted file mode 100644
index 4ad22d58cebcff297a21509d43ba226eb4b1da8e..0000000000000000000000000000000000000000
--- a/loopy/library/symbol.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from __future__ import division
-
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-import numpy as np
-
-
-def opencl_symbol_mangler(name):
-    # FIXME: should be more picky about exact names
-    if name.startswith("FLT_"):
-        return np.dtype(np.float32), name
-    elif name.startswith("DBL_"):
-        return np.dtype(np.float64), name
-    elif name.startswith("M_"):
-        if name.endswith("_F"):
-            return np.dtype(np.float32), name
-        else:
-            return np.dtype(np.float64), name
-    elif name == "INFINITY":
-        return np.dtype(np.float32), name
-    else:
-        return None
-
-# vim: foldmethod=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index d4275ea6a1114e9994e11fb00f008ff4ef584f4d..af279fc4e56d3e6106231db223750555714abed5 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -24,10 +24,7 @@ THE SOFTWARE.
 
 
 import six
-from six.moves import range
 import numpy as np
-import pyopencl as cl
-import pyopencl.characterize as cl_char
 from loopy.diagnostic import (
         LoopyError, LoopyWarning, WriteRaceConditionWarning, warn,
         LoopyAdvisory)
@@ -40,6 +37,36 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+# {{{ propagate target info
+
+def propagate_target(kernel):
+    import loopy as lp
+    new_args = []
+
+    for arg in kernel.args:
+        dtype = arg.picklable_dtype
+        if dtype is not None and dtype is not lp.auto:
+            dtype = dtype.with_target(kernel.target)
+
+        new_args.append(arg.copy(dtype=dtype))
+
+    new_temporary_variables = {}
+    for name, temp in six.iteritems(kernel.temporary_variables):
+        dtype = temp.picklable_dtype
+        if dtype is not None and dtype is not lp.auto:
+            dtype = dtype.with_target(kernel.target)
+
+        new_temporary_variables[name] = temp.copy(dtype=dtype)
+
+    kernel = kernel.copy(
+            args=new_args,
+            temporary_variables=new_temporary_variables)
+
+    return kernel
+
+# }}}
+
+
 # {{{ infer types
 
 def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
@@ -402,7 +429,8 @@ def realize_reduction(kernel, insn_id_filter=None):
         new_temporary_variables[target_var_name] = TemporaryVariable(
                 name=target_var_name,
                 shape=(),
-                dtype=expr.operation.result_dtype(arg_dtype, expr.inames),
+                dtype=expr.operation.result_dtype(
+                    kernel.target, arg_dtype, expr.inames),
                 is_local=False)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -992,90 +1020,16 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 # }}}
 
 
-# {{{ temp storage adjust for bank conflict
-
-def adjust_local_temp_var_storage(kernel, device):
-    logger.debug("%s: adjust temp var storage" % kernel.name)
-
-    new_temp_vars = {}
-
-    lmem_size = cl_char.usable_local_mem_size(device)
-    for temp_var in six.itervalues(kernel.temporary_variables):
-        if not temp_var.is_local:
-            new_temp_vars[temp_var.name] = \
-                    temp_var.copy(storage_shape=temp_var.shape)
-            continue
-
-        other_loctemp_nbytes = [
-                tv.nbytes
-                for tv in six.itervalues(kernel.temporary_variables)
-                if tv.is_local and tv.name != temp_var.name]
-
-        storage_shape = temp_var.storage_shape
-
-        if storage_shape is None:
-            storage_shape = temp_var.shape
-
-        storage_shape = list(storage_shape)
-
-        # sizes of all dims except the last one, which we may change
-        # below to avoid bank conflicts
-        from pytools import product
-
-        if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
-            # FIXME: could try to avoid cache associativity disasters
-            new_storage_shape = storage_shape
-
-        elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
-            min_mult = cl_char.local_memory_bank_count(device)
-            good_incr = None
-            new_storage_shape = storage_shape
-            min_why_not = None
-
-            for increment in range(storage_shape[-1]//2):
-
-                test_storage_shape = storage_shape[:]
-                test_storage_shape[-1] = test_storage_shape[-1] + increment
-                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
-                        device, temp_var.dtype.itemsize,
-                        temp_var.shape, test_storage_shape)
-
-                # will choose smallest increment 'automatically'
-                if new_mult < min_mult:
-                    new_lmem_use = (sum(other_loctemp_nbytes)
-                            + temp_var.dtype.itemsize*product(test_storage_shape))
-                    if new_lmem_use < lmem_size:
-                        new_storage_shape = test_storage_shape
-                        min_mult = new_mult
-                        min_why_not = why_not
-                        good_incr = increment
-
-            if min_mult != 1:
-                from warnings import warn
-                from loopy.diagnostic import LoopyAdvisory
-                warn("could not find a conflict-free mem layout "
-                        "for local variable '%s' "
-                        "(currently: %dx conflict, increment: %s, reason: %s)"
-                        % (temp_var.name, min_mult, good_incr, min_why_not),
-                        LoopyAdvisory)
-        else:
-            from warnings import warn
-            warn("unknown type of local memory")
-
-            new_storage_shape = storage_shape
-
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
-
-    return kernel.copy(temporary_variables=new_temp_vars)
-
-# }}}
-
-
 preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
 def preprocess_kernel(kernel, device=None):
+    if device is not None:
+        from warnings import warn
+        warn("passing 'device' to preprocess_kernel() is deprecated",
+                DeprecationWarning, stacklevel=2)
+
     from loopy.kernel import kernel_state
     if kernel.state != kernel_state.INITIAL:
         raise LoopyError("cannot re-preprocess an already preprocessed "
@@ -1085,14 +1039,10 @@ def preprocess_kernel(kernel, device=None):
 
     from loopy import CACHING_ENABLED
     if CACHING_ENABLED:
-        if device is not None:
-            device_id = device.persistent_unique_id
-        else:
-            device_id = None
+        input_kernel = kernel
 
-        pp_cache_key = (kernel, device_id)
         try:
-            result = preprocess_cache[pp_cache_key]
+            result = preprocess_cache[kernel]
             logger.info("%s: preprocess cache hit" % kernel.name)
             return result
         except KeyError:
@@ -1134,20 +1084,29 @@ def preprocess_kernel(kernel, device=None):
     kernel = find_boostability(kernel)
     kernel = limit_boostability(kernel)
 
-    if device is not None:
-        kernel = adjust_local_temp_var_storage(kernel, device)
-    else:
-        from loopy.diagnostic import warn
-        warn(kernel, "no_device_in_preprocess",
-                "no device parameter was passed to loopy.preprocess")
+    kernel = kernel.target.preprocess(kernel)
 
     logger.info("%s: preprocess done" % kernel.name)
 
     kernel = kernel.copy(
             state=kernel_state.PREPROCESSED)
 
+    # {{{ propagate target info
+
+    # PicklableDtype instances for example need to know the target they're working
+    # towards in order to pickle and unpickle them. This is the first pass that
+    # uses caching, so we need to be ready to pickle. This means propagating
+    # this target information.
+
+    if CACHING_ENABLED:
+        input_kernel = propagate_target(input_kernel)
+
+    kernel = propagate_target(kernel)
+
+    # }}}
+
     if CACHING_ENABLED:
-        preprocess_cache[pp_cache_key] = kernel
+        preprocess_cache[input_kernel] = kernel
 
     return kernel
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 3b3eba49e8cec0d62fd2a57bcb19d795dcb40167..10acada145a3b65cd3232dcf06d0657672c5f347 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -243,10 +243,8 @@ class Reduction(AlgebraicLeaf):
 
     def __init__(self, operation, inames, expr):
         assert isinstance(inames, tuple)
-
-        if isinstance(operation, str):
-            from loopy.library.reduction import parse_reduction_op
-            operation = parse_reduction_op(operation)
+        from loopy.library.reduction import ReductionOperation
+        assert isinstance(operation, ReductionOperation)
 
         self.operation = operation
         self.inames = inames
@@ -620,7 +618,27 @@ class FunctionToPrimitiveMapper(IdentityMapper):
     turns those into the actual pymbolic primitives used for that.
     """
 
+    def _parse_reduction(self, operation, inames, red_expr):
+        if isinstance(inames, Variable):
+            inames = (inames,)
+
+        if not isinstance(inames, (tuple)):
+            raise TypeError("iname argument to reduce() must be a symbol "
+                    "or a tuple of symbols")
+
+        processed_inames = []
+        for iname in inames:
+            if not isinstance(iname, Variable):
+                raise TypeError("iname argument to reduce() must be a symbol "
+                        "or a tuple or a tuple of symbols")
+
+            processed_inames.append(iname.name)
+
+        return Reduction(operation, tuple(processed_inames), red_expr)
+
     def map_call(self, expr):
+        from loopy.library.reduction import parse_reduction_op
+
         from pymbolic.primitives import Variable
         if not isinstance(expr.function, Variable):
             return IdentityMapper.map_call(self, expr)
@@ -644,51 +662,38 @@ class FunctionToPrimitiveMapper(IdentityMapper):
         elif name == "reduce":
             if len(expr.parameters) == 3:
                 operation, inames, red_expr = expr.parameters
+
+                if not isinstance(operation, Variable):
+                    raise TypeError("operation argument to reduce() "
+                            "must be a symbol")
+
+                operation = parse_reduction_op(operation.name)
+                return self._parse_reduction(operation, inames, self.rec(red_expr))
             else:
                 raise TypeError("invalid 'reduce' calling sequence")
 
         elif name == "if":
-            if len(expr.parameters) in [2, 3]:
+            if len(expr.parameters) == 3:
                 from pymbolic.primitives import If
                 return If(*expr.parameters)
             else:
-                raise TypeError("if takes two or three arguments")
+                raise TypeError("if takes three arguments")
 
         else:
             # see if 'name' is an existing reduction op
 
-            from loopy.library.reduction import parse_reduction_op
-            if parse_reduction_op(name):
+            operation = parse_reduction_op(name)
+            if operation:
                 if len(expr.parameters) != 2:
                     raise RuntimeError("invalid invocation of "
                             "reduction operation '%s'" % expr.function.name)
 
-                operation = expr.function
                 inames, red_expr = expr.parameters
+                return self._parse_reduction(operation, inames, self.rec(red_expr))
+
             else:
                 return IdentityMapper.map_call(self, expr)
 
-        red_expr = self.rec(red_expr)
-
-        if not isinstance(operation, Variable):
-            raise TypeError("operation argument to reduce() must be a symbol")
-        operation = operation.name
-        if isinstance(inames, Variable):
-            inames = (inames,)
-
-        if not isinstance(inames, (tuple)):
-            raise TypeError("iname argument to reduce() must be a symbol "
-                    "or a tuple of symbols")
-
-        processed_inames = []
-        for iname in inames:
-            if not isinstance(iname, Variable):
-                raise TypeError("iname argument to reduce() must be a symbol "
-                        "or a tuple or a tuple of symbols")
-
-            processed_inames.append(iname.name)
-
-        return Reduction(operation, tuple(processed_inames), red_expr)
 
 # {{{ customization to pymbolic parser
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d36c39044cb4448d59e5dae78f5cca13afb06d11
--- /dev/null
+++ b/loopy/target/__init__.py
@@ -0,0 +1,84 @@
+"""Base target interface."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+class TargetBase(object):
+    """Objects of this type must be picklable."""
+
+    # {{{ persistent hashing
+
+    hash_fields = []
+    comparison_fields = []
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        for field_name in self.hash_fields:
+            key_builder.rec(key_hash, getattr(self, field_name))
+
+    def __eq__(self, other):
+        if type(self) != type(other):
+            return False
+
+        for field_name in self.comparison_fields:
+            if getattr(self, field_name) != getattr(other, field_name):
+                return False
+
+        return True
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    # }}}
+
+    def preprocess(self, kernel):
+        return kernel
+
+    def pre_codegen_check(self, kernel):
+        pass
+
+    def function_manglers(self):
+        return []
+
+    def symbol_manglers(self):
+        return []
+
+    def preamble_generators(self):
+        return []
+
+    def get_or_register_dtype(self, names, dtype=None):
+        raise NotImplementedError()
+
+    def dtype_to_typename(self, dtype):
+        raise NotImplementedError()
+
+    def is_vector_dtype(self, dtype):
+        raise NotImplementedError()
+
+    def vector_dtype(self, base, count):
+        raise NotImplementedError()
+
+    def alignment_requirement(self, type_decl):
+        import struct
+        return struct.calcsize(type_decl.struct_format())
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..04efdedabb6e281b99c8ffae3a5be53e084524ed
--- /dev/null
+++ b/loopy/target/opencl/__init__.py
@@ -0,0 +1,274 @@
+"""OpenCL target independent of PyOpenCL."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+
+from loopy.target import TargetBase
+
+
+# {{{ type registry
+
+def _register_types():
+    from loopy.target.opencl.compyte.dtypes import (
+            _fill_dtype_registry, get_or_register_dtype)
+    import struct
+
+    _fill_dtype_registry(respect_windows=False, include_bool=False)
+
+    # complex number support left out
+
+    is_64_bit = struct.calcsize('@P') * 8 == 64
+    if not is_64_bit:
+        get_or_register_dtype(
+                ["unsigned long", "unsigned long int"], np.uint64)
+        get_or_register_dtype(
+                ["signed long", "signed long int", "long int"], np.int64)
+
+_register_types()
+
+# }}}
+
+
+# {{{ vector types
+
+class vec:
+    pass
+
+
+def _create_vector_types():
+    field_names = ["x", "y", "z", "w"]
+
+    from loopy.target.opencl.compyte.dtypes import get_or_register_dtype
+
+    vec.types = {}
+    vec.type_to_scalar_and_count = {}
+
+    counts = [2, 3, 4, 8, 16]
+
+    for base_name, base_type in [
+            ('char', np.int8),
+            ('uchar', np.uint8),
+            ('short', np.int16),
+            ('ushort', np.uint16),
+            ('int', np.int32),
+            ('uint', np.uint32),
+            ('long', np.int64),
+            ('ulong', np.uint64),
+            ('float', np.float32),
+            ('double', np.float64),
+            ]:
+        for count in counts:
+            name = "%s%d" % (base_name, count)
+
+            titles = field_names[:count]
+
+            padded_count = count
+            if count == 3:
+                padded_count = 4
+
+            names = ["s%d" % i for i in range(count)]
+            while len(names) < padded_count:
+                names.append("padding%d" % (len(names)-count))
+
+            if len(titles) < len(names):
+                titles.extend((len(names)-len(titles))*[None])
+
+            try:
+                dtype = np.dtype(dict(
+                    names=names,
+                    formats=[base_type]*padded_count,
+                    titles=titles))
+            except NotImplementedError:
+                try:
+                    dtype = np.dtype([((n, title), base_type)
+                                      for (n, title) in zip(names, titles)])
+                except TypeError:
+                    dtype = np.dtype([(n, base_type) for (n, title)
+                                      in zip(names, titles)])
+
+            get_or_register_dtype(name, dtype)
+
+            setattr(vec, name, dtype)
+
+            def create_array(dtype, count, padded_count, *args, **kwargs):
+                if len(args) < count:
+                    from warnings import warn
+                    warn("default values for make_xxx are deprecated;"
+                            " instead specify all parameters or use"
+                            " array.vec.zeros_xxx", DeprecationWarning)
+                padded_args = tuple(list(args)+[0]*(padded_count-len(args)))
+                array = eval("array(padded_args, dtype=dtype)",
+                        dict(array=np.array, padded_args=padded_args,
+                        dtype=dtype))
+                for key, val in kwargs.items():
+                    array[key] = val
+                return array
+
+            setattr(vec, "make_"+name, staticmethod(eval(
+                    "lambda *args, **kwargs: create_array(dtype, %i, %i, "
+                    "*args, **kwargs)" % (count, padded_count),
+                    dict(create_array=create_array, dtype=dtype))))
+            setattr(vec, "filled_"+name, staticmethod(eval(
+                    "lambda val: vec.make_%s(*[val]*%i)" % (name, count))))
+            setattr(vec, "zeros_"+name,
+                    staticmethod(eval("lambda: vec.filled_%s(0)" % (name))))
+            setattr(vec, "ones_"+name,
+                    staticmethod(eval("lambda: vec.filled_%s(1)" % (name))))
+
+            vec.types[np.dtype(base_type), count] = dtype
+            vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
+
+_create_vector_types()
+
+# }}}
+
+
+# {{{ function mangler
+
+def opencl_function_mangler(target, name, arg_dtypes):
+    if not isinstance(name, str):
+        return None
+
+    if name in ["max", "min"] and len(arg_dtypes) == 2:
+        dtype = np.find_common_type([], arg_dtypes)
+
+        if dtype.kind == "c":
+            raise RuntimeError("min/max do not support complex numbers")
+
+        if dtype.kind == "f":
+            name = "f" + name
+
+        return dtype, name
+
+    if name in "atan2" and len(arg_dtypes) == 2:
+        return arg_dtypes[0], name
+
+    if name == "dot":
+        scalar_dtype, offset, field_name = arg_dtypes[0].fields["s0"]
+        return scalar_dtype, name
+
+    return None
+
+# }}}
+
+
+# {{{ symbol mangler
+
+def opencl_symbol_mangler(target, name):
+    # FIXME: should be more picky about exact names
+    if name.startswith("FLT_"):
+        return np.dtype(np.float32), name
+    elif name.startswith("DBL_"):
+        return np.dtype(np.float64), name
+    elif name.startswith("M_"):
+        if name.endswith("_F"):
+            return np.dtype(np.float32), name
+        else:
+            return np.dtype(np.float64), name
+    elif name == "INFINITY":
+        return np.dtype(np.float32), name
+    else:
+        return None
+
+# }}}
+
+
+# {{{ preamble generator
+
+def opencl_preamble_generator(target, seen_dtypes, seen_functions):
+    has_double = False
+
+    for dtype in seen_dtypes:
+        if dtype in [np.float64, np.complex128]:
+            has_double = True
+
+    if has_double:
+        yield ("00_enable_double", """
+            #if __OPENCL_C_VERSION__ < 120
+            #pragma OPENCL EXTENSION cl_khr_fp64: enable
+            #endif
+            """)
+
+    c_funcs = set(func.c_name for func in seen_functions)
+    if "int_floor_div" in c_funcs:
+        yield ("05_int_floor_div", """
+            #define int_floor_div(a,b) \
+              (( (a) - \
+                 ( ( (a)<0 ) != ( (b)<0 )) \
+                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
+               / (b) )
+            """)
+
+    if "int_floor_div_pos_b" in c_funcs:
+        yield ("05_int_floor_div_pos_b", """
+            #define int_floor_div_pos_b(a,b) ( \
+                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
+                )
+            """)
+
+# }}}
+
+
+# {{{ target
+
+class OpenCLTarget(TargetBase):
+    def function_manglers(self):
+        return (
+                super(OpenCLTarget, self).function_manglers() + [
+                    opencl_function_mangler
+                    ])
+
+    def symbol_manglers(self):
+        return (
+                super(OpenCLTarget, self).symbol_manglers() + [
+                    opencl_symbol_mangler
+                    ])
+
+    def preamble_generators(self):
+        from loopy.library.reduction import reduction_preamble_generator
+        return (
+                super(OpenCLTarget, self).preamble_generators() + [
+                    opencl_preamble_generator,
+                    reduction_preamble_generator
+                    ])
+
+    def get_or_register_dtype(self, names, dtype=None):
+        from loopy.target.opencl.compyte.dtypes import get_or_register_dtype
+        return get_or_register_dtype(names, dtype)
+
+    def dtype_to_typename(self, dtype):
+        from loopy.target.opencl.compyte.dtypes import dtype_to_ctype
+        return dtype_to_ctype(dtype)
+
+    def is_vector_dtype(self, dtype):
+        return list(vec.types.values())
+
+    def get_vector_dtype(self, base, count):
+        return vec.types[base, count]
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/target/opencl/compyte b/loopy/target/opencl/compyte
new file mode 160000
index 0000000000000000000000000000000000000000..5d54e1b2b7f28d3e779029ac0b4aa5f957829f23
--- /dev/null
+++ b/loopy/target/opencl/compyte
@@ -0,0 +1 @@
+Subproject commit 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fff7643c5919d95c612a411d083a3388bcfd384
--- /dev/null
+++ b/loopy/target/pyopencl/__init__.py
@@ -0,0 +1,282 @@
+"""OpenCL target integrated with PyOpenCL."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+from six.moves import range
+
+import numpy as np
+
+from loopy.target.opencl import OpenCLTarget
+
+import pyopencl as cl
+import pyopencl.characterize as cl_char
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+# {{{ temp storage adjust for bank conflict
+
+def adjust_local_temp_var_storage(kernel, device):
+    logger.debug("%s: adjust temp var storage" % kernel.name)
+
+    new_temp_vars = {}
+
+    lmem_size = cl_char.usable_local_mem_size(device)
+    for temp_var in six.itervalues(kernel.temporary_variables):
+        if not temp_var.is_local:
+            new_temp_vars[temp_var.name] = \
+                    temp_var.copy(storage_shape=temp_var.shape)
+            continue
+
+        other_loctemp_nbytes = [
+                tv.nbytes
+                for tv in six.itervalues(kernel.temporary_variables)
+                if tv.is_local and tv.name != temp_var.name]
+
+        storage_shape = temp_var.storage_shape
+
+        if storage_shape is None:
+            storage_shape = temp_var.shape
+
+        storage_shape = list(storage_shape)
+
+        # sizes of all dims except the last one, which we may change
+        # below to avoid bank conflicts
+        from pytools import product
+
+        if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
+            # FIXME: could try to avoid cache associativity disasters
+            new_storage_shape = storage_shape
+
+        elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
+            min_mult = cl_char.local_memory_bank_count(device)
+            good_incr = None
+            new_storage_shape = storage_shape
+            min_why_not = None
+
+            for increment in range(storage_shape[-1]//2):
+
+                test_storage_shape = storage_shape[:]
+                test_storage_shape[-1] = test_storage_shape[-1] + increment
+                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
+                        device, temp_var.dtype.itemsize,
+                        temp_var.shape, test_storage_shape)
+
+                # will choose smallest increment 'automatically'
+                if new_mult < min_mult:
+                    new_lmem_use = (sum(other_loctemp_nbytes)
+                            + temp_var.dtype.itemsize*product(test_storage_shape))
+                    if new_lmem_use < lmem_size:
+                        new_storage_shape = test_storage_shape
+                        min_mult = new_mult
+                        min_why_not = why_not
+                        good_incr = increment
+
+            if min_mult != 1:
+                from warnings import warn
+                from loopy.diagnostic import LoopyAdvisory
+                warn("could not find a conflict-free mem layout "
+                        "for local variable '%s' "
+                        "(currently: %dx conflict, increment: %s, reason: %s)"
+                        % (temp_var.name, min_mult, good_incr, min_why_not),
+                        LoopyAdvisory)
+        else:
+            from warnings import warn
+            warn("unknown type of local memory")
+
+            new_storage_shape = storage_shape
+
+        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+
+    return kernel.copy(temporary_variables=new_temp_vars)
+
+# }}}
+
+
+# {{{ check sizes against device properties
+
+def check_sizes(kernel, device):
+    import loopy as lp
+
+    if device is None:
+        from loopy.diagnostic import warn
+        warn(kernel, "no_device_in_pre_codegen_checks",
+                "No device parameter was passed to the PyOpenCLTarget. "
+                "Perhaps you want to pass a device to benefit from "
+                "additional checking.")
+        return
+
+    from loopy.diagnostic import LoopyAdvisory, LoopyError
+
+    parameters = {}
+    for arg in kernel.args:
+        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
+            parameters[arg.name] = arg.approximately
+
+    glens, llens = kernel.get_grid_sizes_as_exprs()
+
+    if (max(len(glens), len(llens))
+            > device.max_work_item_dimensions):
+        raise LoopyError("too many work item dimensions")
+
+    from pymbolic import evaluate
+    from pymbolic.mapper.evaluator import UnknownVariableError
+    try:
+        glens = evaluate(glens, parameters)
+        llens = evaluate(llens, parameters)
+    except UnknownVariableError as name:
+        from warnings import warn
+        warn("could not check axis bounds because no value "
+                "for variable '%s' was passed to check_kernels()"
+                % name, LoopyAdvisory)
+    else:
+        for i in range(len(llens)):
+            if llens[i] > device.max_work_item_sizes[i]:
+                raise LoopyError("group axis %d too big" % i)
+
+        from pytools import product
+        if product(llens) > device.max_work_group_size:
+            raise LoopyError("work group too big")
+
+    from pyopencl.characterize import usable_local_mem_size
+    if kernel.local_mem_use() > usable_local_mem_size(device):
+        raise LoopyError("using too much local memory")
+
+    from loopy.kernel.data import ConstantArg
+    const_arg_count = sum(
+            1 for arg in kernel.args
+            if isinstance(arg, ConstantArg))
+
+    if const_arg_count > device.max_constant_args:
+        raise LoopyError("too many constant arguments")
+
+# }}}
+
+
+def pyopencl_function_mangler(target, name, arg_dtypes):
+    if len(arg_dtypes) == 1 and isinstance(name, str):
+        arg_dtype, = arg_dtypes
+
+        if arg_dtype.kind == "c":
+            if arg_dtype == np.complex64:
+                tpname = "cfloat"
+            elif arg_dtype == np.complex128:
+                tpname = "cdouble"
+            else:
+                raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
+
+            if name in ["sqrt", "exp", "log",
+                    "sin", "cos", "tan",
+                    "sinh", "cosh", "tanh",
+                    "conj"]:
+                return arg_dtype, "%s_%s" % (tpname, name)
+
+            if name in ["real", "imag", "abs"]:
+                return np.dtype(arg_dtype.type(0).real), "%s_%s" % (tpname, name)
+
+    return None
+
+
+# {{{ preamble generator
+
+def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
+    has_double = False
+    has_complex = False
+
+    for dtype in seen_dtypes:
+        if dtype in [np.float64, np.complex128]:
+            has_double = True
+        if dtype.kind == "c":
+            has_complex = True
+
+    if has_complex:
+        if has_double:
+            yield ("10_include_complex_header", """
+                #define PYOPENCL_DEFINE_CDOUBLE
+
+                #include <pyopencl-complex.h>
+                """)
+        else:
+            yield ("10_include_complex_header", """
+                #include <pyopencl-complex.h>
+                """)
+
+# }}}
+
+
+# {{{
+
+class PyOpenCLTarget(OpenCLTarget):
+    def __init__(self, device=None):
+        super(PyOpenCLTarget, self).__init__()
+
+        self.device = device
+
+    hash_fields = ["device"]
+    comparison_fields = ["device"]
+
+    def function_manglers(self):
+        return (
+                super(PyOpenCLTarget, self).function_manglers() + [
+                    pyopencl_function_mangler
+                    ])
+
+    def preamble_generators(self):
+        return ([
+            pyopencl_preamble_generator
+            ] + super(PyOpenCLTarget, self).preamble_generators())
+
+    def preprocess(self, kernel):
+        return kernel
+
+    def pre_codegen_check(self, kernel):
+        check_sizes(kernel, self.device)
+
+    def get_or_register_dtype(self, names, dtype=None):
+        from pyopencl.compyte.dtypes import get_or_register_dtype
+        return get_or_register_dtype(names, dtype)
+
+    def dtype_to_typename(self, dtype):
+        from pyopencl.compyte.dtypes import dtype_to_ctype
+        return dtype_to_ctype(dtype)
+
+    def is_vector_dtype(self, dtype):
+        from pyopencl.array import vec
+        return dtype in list(vec.types.values())
+
+    def vector_dtype(self, base, count):
+        from pyopencl.array import vec
+        return vec.types[base, count]
+
+    def alignment_requirement(self, type_decl):
+        import pyopencl._pvt_struct as _struct
+        return _struct.calcsize(type_decl.struct_format())
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/tools.py b/loopy/tools.py
index 19fdcf2ce5f5f1c02125edb616d5342f0b8f5b54..29005cfbb6aca798d6b0f51c3dabf9a2b75e53a4 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -106,7 +106,7 @@ class PicklableDtype(object):
     The issues are the following
 
     - :class:`numpy.dtype` objects for custom types in :mod:`loopy` are usually
-      registered in the :mod:`pyopencl` dtype registry. This registration may
+      registered in the target's dtype registry. This registration may
       have been lost after unpickling. This container restores it implicitly,
       as part of unpickling.
 
@@ -115,10 +115,13 @@ class PicklableDtype(object):
       by retrieving the 'canonical' type from the dtype registry.
     """
 
-    def __init__(self, dtype):
+    def __init__(self, dtype, target=None):
+        assert not isinstance(dtype, PicklableDtype)
+
         if dtype is None:
             raise TypeError("may not pass None to construct PicklableDtype")
 
+        self.target = target
         self.dtype = np.dtype(dtype)
 
     def __hash__(self):
@@ -133,14 +136,25 @@ class PicklableDtype(object):
         return not self.__eq__(self, other)
 
     def __getstate__(self):
-        from pyopencl.compyte.dtypes import DTYPE_TO_NAME
-        c_name = DTYPE_TO_NAME[self.dtype]
+        if self.target is None:
+            raise RuntimeError("unable to pickle dtype: target not known")
 
-        return (c_name, self.dtype)
+        c_name = self.target.dtype_to_typename(self.dtype)
+        return (self.target, c_name, self.dtype)
 
     def __setstate__(self, state):
-        name, dtype = state
-        from pyopencl.tools import get_or_register_dtype
-        self.dtype = get_or_register_dtype([name], dtype)
+        target, name, dtype = state
+        self.target = target
+        self.dtype = self.target.get_or_register_dtype([name], dtype)
+
+    def with_target(self, target):
+        if (self.target is not None
+                and target is not self.target):
+            raise RuntimeError("target already set to different value")
+
+        return PicklableDtype(self.dtype, target)
+
+    def assert_has_target(self):
+        assert self.target is not None
 
 # vim: foldmethod=marker
diff --git a/loopy/version.py b/loopy/version.py
index 8c58c4fe29bfdc35f31f9f2168edda7137e3a1bb..8f68a4958da821981a6922bd14335a2bc487473a 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -25,4 +25,4 @@ VERSION = (2014, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
-DATA_MODEL_VERSION = "v2"
+DATA_MODEL_VERSION = "v5"
diff --git a/setup.py b/setup.py
index d3d20fce6645c84b35f5f2da58fc3786fc576b70..b3cefdac0ad16fa7b4ffdf9969a47ac509f7a257 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,6 @@ setup(name="loo.py",
 
       install_requires=[
           "pytools>=2014.2",
-          "pyopencl>=2014.1",
           "pymbolic>=2014.1.1",
           "cgen>=2013.1.2",
           "islpy>=2014.2",