From 67669015c5d585c2c018e97f8f27c2187dde3ecb Mon Sep 17 00:00:00 2001
From: arghdos <arghdos@gmail.com>
Date: Mon, 1 May 2017 12:11:53 -0400
Subject: [PATCH] first attempt at subclassed executors / wrappers

---
 loopy/execution.py                 | 707 ++++++++++++++++++++++++++-
 loopy/target/c/__init__.py         |  10 +-
 loopy/target/c/c_execution.py      | 265 +++++++++++
 loopy/target/pyopencl_execution.py | 734 ++++++-----------------------
 4 files changed, 1121 insertions(+), 595 deletions(-)
 create mode 100644 loopy/target/c/c_execution.py

diff --git a/loopy/execution.py b/loopy/execution.py
index 5680fdbfe..65968e663 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -27,6 +27,8 @@ import six
 import numpy as np
 from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import LoopyError
+from pytools.py_codegen import (
+        Indentation, PythonFunctionGenerator)
 
 
 # {{{ object array argument packing
@@ -111,7 +113,613 @@ class SeparateArrayPackingController(object):
 # }}}
 
 
-# {{{ KernelExecutorBase
+# {{{ ExecutionWrapperGeneratorBase
+
+class ExecutionWrapperGeneratorBase(object):
+    """
+    A set of common methods for generating a wrapper
+    for execution of C-based languages
+
+    """
+
+    def __init__(self, system_args=("_lpy_c_kernels",)):
+        self.system_args = list(system_args)  # own list, safe to concatenate
+
+    def python_dtype_str(self, dtype):
+        if not dtype.isbuiltin:  # cannot be spelled as "_lpy_np.<name>"
+            raise Exception('dtype: {} not recognized'.format(dtype))
+        return "_lpy_np."+dtype.name
+
+    # {{{ invoker generation
+
+    # /!\ This code runs in a namespace controlled by the user.
+    # Prefix all auxiliary variables with "_lpy".
+
+    # {{{ integer arg finding from shapes
+
+    def generate_integer_arg_finding_from_shapes(
+            self, gen, kernel, implemented_data_info):
+        """Emit code deducing integer args from passed arrays' shapes."""
+        # maps integer argument name -> list of (arg_name, expression),
+        # where expression is a unary function of kernel.arg_dict[arg_name]
+        # returning the desired integer argument.
+        iarg_to_sources = {}
+
+        from loopy.kernel.data import GlobalArg
+        from loopy.symbolic import DependencyMapper, StringifyMapper
+        from loopy.diagnostic import ParameterFinderWarning
+        dep_map = DependencyMapper()
+
+        from pymbolic import var
+        for arg in implemented_data_info:
+            if arg.arg_class is GlobalArg:
+                sym_shape = var(arg.name).attr("shape")
+                for axis_nr, shape_i in enumerate(arg.shape):
+                    if shape_i is None:
+                        continue
+
+                    deps = dep_map(shape_i)
+
+                    if len(deps) == 1:
+                        integer_arg_var, = deps
+
+                        if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
+                            from pymbolic.algorithm import solve_affine_equations_for
+                            try:
+                                # heavier than needed for a single unknown
+                                iarg_expr = solve_affine_equations_for(
+                                        [integer_arg_var.name],
+                                        [(shape_i, sym_shape.index(axis_nr))]
+                                        )[integer_arg_var]
+                            except Exception as e:
+                                # Deduction is best-effort: if the shape
+                                # expression cannot be solved for the
+                                # argument, only warn; no source is
+                                # recorded for this array axis.
+                                from warnings import warn
+                                warn("Unable to generate code to automatically "
+                                        "find '%s' from the shape of '%s':\n%s"
+                                        % (integer_arg_var.name, arg.name, str(e)),
+                                        ParameterFinderWarning)
+                            else:
+                                iarg_to_sources.setdefault(integer_arg_var.name, [])\
+                                        .append((arg.name, iarg_expr))
+
+        gen("# {{{ find integer arguments from shapes")
+        gen("")
+
+        for iarg_name, sources in six.iteritems(iarg_to_sources):
+            gen("if %s is None:" % iarg_name)
+            with Indentation(gen):
+                if_stmt = "if"
+                for arg_name, value_expr in sources:
+                    gen("%s %s is not None:" % (if_stmt, arg_name))
+                    with Indentation(gen):
+                        gen("%s = %s"
+                                % (iarg_name, StringifyMapper()(value_expr)))
+
+                    if_stmt = "elif"
+
+            gen("")
+
+        gen("# }}}")
+        gen("")
+
+    # }}}
+
+    # {{{ integer arg finding from offsets
+
+    def generate_integer_arg_finding_from_offsets(self, gen, kernel,
+                                                  implemented_data_info):
+        options = kernel.options
+        # Emits code filling offset args left as None from <array>.offset.
+        gen("# {{{ find integer arguments from offsets")
+        gen("")
+
+        for arg in implemented_data_info:
+            impl_array_name = arg.offset_for_name
+            if impl_array_name is not None:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("if %s is None:" % impl_array_name)
+                    with Indentation(gen):
+                        gen("# Output variable, we'll be allocating "
+                                "it, with zero offset.")
+                        gen("%s = 0" % arg.name)
+                    gen("else:")
+                    with Indentation(gen):
+                        if not options.no_numpy:
+                            gen("_lpy_offset = getattr(%s, \"offset\", 0)"
+                                    % impl_array_name)
+                        else:
+                            gen("_lpy_offset = %s.offset" % impl_array_name)
+                        # byte offset -> offset counted in dtype items
+                        base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                        if not options.skip_arg_checks:
+                            gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
+                                    % (arg.name, base_arg.dtype.itemsize))
+
+                            gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
+                                    "not divisible by its dtype itemsize\""
+                                    % impl_array_name)
+                            gen("del _lpy_remdr")
+                        else:
+                            gen("%s = _lpy_offset // %d"
+                                    % (arg.name, base_arg.dtype.itemsize))
+                        # NOTE(review): cleanup only when checks are on?
+                        if not options.skip_arg_checks:
+                            gen("del _lpy_offset")
+
+        gen("# }}}")
+        gen("")
+
+    # }}}
+
+    # {{{ integer arg finding from strides
+
+    def generate_integer_arg_finding_from_strides(
+            self, gen, kernel, implemented_data_info):
+        """Emit code deducing stride arguments from passed arrays' strides."""
+        options = kernel.options
+
+        gen("# {{{ find integer arguments from strides")
+        gen("")
+
+        for arg in implemented_data_info:
+            if arg.stride_for_name_and_axis is not None:
+                impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
+                base_arg = kernel.impl_arg_to_arg[impl_array_name]
+
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    if not options.skip_arg_checks:
+                        gen("if %s is None:" % impl_array_name)
+                        with Indentation(gen):
+                            gen("raise RuntimeError(\"required stride '%s' for "
+                                    "argument '%s' not given or deducible from "
+                                    "passed array\")"
+                                    % (arg.name, impl_array_name))
+                        gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
+                                % (arg.name, impl_array_name, stride_impl_axis,
+                                    base_arg.dtype.dtype.itemsize))
+                        gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' "
+                                " is not divisible by its dtype itemsize\""
+                                % (stride_impl_axis, impl_array_name))
+                        gen("del _lpy_remdr")
+                    else:
+                        # checks are off: still assign the stride, otherwise
+                        # the generated "if" block would be empty
+                        gen("%s = %s.strides[%d] // %d"
+                                % (arg.name, impl_array_name, stride_impl_axis,
+                                    base_arg.dtype.itemsize))
+
+        gen("# }}}")
+        gen("")
+
+    # }}}
+
+    # {{{ check that value args are present
+
+    def generate_value_arg_check(
+            self, gen, kernel, implemented_data_info):
+        """Emit a None check (raising TypeError) for every value argument."""
+        if kernel.options.skip_arg_checks:
+            return
+
+        from loopy.kernel.data import ValueArg
+
+        gen("# {{{ check that value args are present")
+        gen("")
+
+        value_arg_names = [
+                idi.name for idi in implemented_data_info
+                if issubclass(idi.arg_class, ValueArg)]
+        for name in value_arg_names:
+            gen("if %s is None:" % name)
+            with Indentation(gen):
+                gen("raise TypeError(\"value argument '%s' "
+                        "was not given and could not be automatically "
+                        "determined\")" % name)
+        gen("# }}}")
+        gen("")
+
+    # }}}
+
+    # {{{ handle non numpy arguments
+
+    def handle_non_numpy_arg(self, gen, arg):  # always raises in this class
+        raise Exception('Non-numpy args are not allowed for C-execution')
+
+    # }}}
+
+    # {{{ handle allocation of unspecified arguments
+
+    def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
+        """Emit code that allocates *arg* with ``_lpy_np.empty``.
+        Shape comes from ``arg.unvec_shape``, dtype from *kernel_arg*.
+        """
+        from pymbolic import var
+
+        num_axes = len(arg.unvec_shape)
+        for i in range(num_axes):
+            gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
+        # NOTE(review): arg.order is emitted verbatim (unquoted) -- confirm
+        sym_order = var('_lpy_order')
+        gen("%s = %s" % (strify(sym_order), arg.order))
+
+        sym_shape = tuple(
+                var("_lpy_shape_%d" % i)
+                for i in range(num_axes))
+
+        if not skip_arg_checks:
+            for i in range(num_axes):
+                gen("assert _lpy_shape_%d > 0, "
+                        "\"'%s' has negative shape in axis %d\""
+                        % (i, arg.name, i))
+
+        gen("%(name)s = _lpy_np.empty(%(shape)s, "
+                "%(dtype)s, order=%(order)s)"
+                % dict(
+                    name=arg.name,
+                    shape=strify(sym_shape),
+                    order=strify(sym_order),
+                    dtype=self.python_dtype_str(
+                        kernel_arg.dtype.numpy_dtype)))
+        # temporaries are only deleted when argument checks are enabled
+        if not skip_arg_checks:
+            for i in range(num_axes):
+                gen("del _lpy_shape_%d" % i)
+            gen("del %s" % strify(sym_order))
+            gen("")
+
+    # }}}
+
+    # {{{ arg setup
+
+    def generate_arg_setup(
+            self, gen, kernel, implemented_data_info, options):
+        import loopy as lp
+
+        from loopy.kernel.data import KernelArgument
+        from loopy.kernel.array import ArrayBase
+        from loopy.symbolic import StringifyMapper
+        from loopy.types import NumpyType
+        # Emits per-array checks/allocation; returns kernel call arguments.
+        gen("# {{{ set up array arguments")
+        gen("")
+
+        if not options.no_numpy:
+            gen("_lpy_encountered_numpy = False")
+            gen("_lpy_encountered_dev = False")
+            gen("")
+        # argument expressions for the kernel call, in processing order
+        args = []
+
+        strify = StringifyMapper()
+        # set once a non-KernelArgument entry has been seen
+        expect_no_more_arguments = False
+
+        for arg_idx, arg in enumerate(implemented_data_info):
+            is_written = arg.base_name in kernel.get_written_variables()
+            kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
+
+            if not issubclass(arg.arg_class, KernelArgument):
+                expect_no_more_arguments = True
+                continue
+
+            if expect_no_more_arguments:
+                raise LoopyError("Further arguments encountered after arg info "
+                        "describing a global temporary variable")
+
+            if not issubclass(arg.arg_class, ArrayBase):
+                args.append(arg.name)
+                continue
+
+            gen("# {{{ process %s" % arg.name)
+            gen("")
+
+            if not options.no_numpy:
+                self.handle_non_numpy_arg(gen, arg)
+
+            if not options.skip_arg_checks and not is_written:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"input argument '%s' must "
+                            "be supplied\")" % arg.name)
+                    gen("")
+
+            if (is_written
+                    and arg.arg_class is lp.ImageArg
+                    and not options.skip_arg_checks):
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"written image '%s' must "
+                            "be supplied\")" % arg.name)
+                    gen("")
+
+            if is_written and arg.shape is None and not options.skip_arg_checks:
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    gen("raise RuntimeError(\"written argument '%s' has "
+                            "unknown shape and must be supplied\")" % arg.name)
+                    gen("")
+
+            possibly_made_by_loopy = False
+
+            # {{{ allocate written arrays, if needed
+
+            if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                    and arg.shape is not None:
+
+                if not isinstance(arg.dtype, NumpyType):
+                    raise LoopyError("do not know how to pass arg of type '%s'"
+                            % arg.dtype)
+
+                possibly_made_by_loopy = True
+                gen("_lpy_made_by_loopy = False")
+                gen("")
+
+                gen("if %s is None:" % arg.name)
+                with Indentation(gen):
+                    self.handle_alloc(
+                        gen, arg, kernel_arg, strify, options.skip_arg_checks)
+                    gen("_lpy_made_by_loopy = True")
+                    gen("")
+
+            # }}}
+
+            # {{{ argument checking
+
+            if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
+                    and not options.skip_arg_checks:
+                if possibly_made_by_loopy:
+                    gen("if not _lpy_made_by_loopy:")
+                else:
+                    gen("if True:")
+
+                with Indentation(gen):
+                    gen("if %s.dtype != %s:"
+                            % (arg.name, self.python_dtype_str(
+                                kernel_arg.dtype.numpy_dtype)))
+                    with Indentation(gen):
+                        gen("raise TypeError(\"dtype mismatch on argument '%s' "
+                                "(got: %%s, expected: %s)\" %% %s.dtype)"
+                                % (arg.name, arg.dtype, arg.name))
+
+                    # {{{ generate shape checking code
+
+                    def strify_allowing_none(shape_axis):
+                        if shape_axis is None:
+                            return "None"
+                        else:
+                            return strify(shape_axis)
+
+                    def strify_tuple(t):
+                        if len(t) == 0:
+                            return "()"
+                        else:
+                            return "(%s,)" % ", ".join(
+                                    strify_allowing_none(sa)
+                                    for sa in t)
+
+                    shape_mismatch_msg = (
+                            "raise TypeError(\"shape mismatch on argument '%s' "
+                            "(got: %%s, expected: %%s)\" "
+                            "%% (%s.shape, %s))"
+                            % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
+
+                    if kernel_arg.shape is None:
+                        pass
+
+                    elif any(shape_axis is None for shape_axis in kernel_arg.shape):
+                        gen("if len(%s.shape) != %s:"
+                                % (arg.name, len(arg.unvec_shape)))
+                        with Indentation(gen):
+                            gen(shape_mismatch_msg)
+
+                        for i, shape_axis in enumerate(arg.unvec_shape):
+                            if shape_axis is None:
+                                continue
+
+                            gen("if %s.shape[%d] != %s:"
+                                    % (arg.name, i, strify(shape_axis)))
+                            with Indentation(gen):
+                                gen(shape_mismatch_msg)
+
+                    else:  # not None, no Nones in tuple
+                        gen("if %s.shape != %s:"
+                                % (arg.name, strify(arg.unvec_shape)))
+                        with Indentation(gen):
+                            gen(shape_mismatch_msg)
+
+                    # }}}
+
+                    if arg.unvec_strides and kernel_arg.dim_tags:
+                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+                        sym_strides = tuple(
+                                itemsize*s_i for s_i in arg.unvec_strides)
+                        gen("if %s.strides != %s:"
+                                % (arg.name, strify(sym_strides)))
+                        with Indentation(gen):
+                            gen("raise TypeError(\"strides mismatch on "
+                                    "argument '%s' (got: %%s, expected: %%s)\" "
+                                    "%% (%s.strides, %s))"
+                                    % (arg.name, arg.name, strify(sym_strides)))
+
+                    if not arg.allows_offset:
+                        gen("if %s.offset:" % arg.name)
+                        with Indentation(gen):
+                            gen("raise ValueError(\"Argument '%s' does not "
+                                    "allow arrays with offsets. Try passing "
+                                    "default_offset=loopy.auto to make_kernel()."
+                                    "\")" % arg.name)
+                            gen("")
+
+            # }}}
+
+            if possibly_made_by_loopy and not options.skip_arg_checks:
+                gen("del _lpy_made_by_loopy")
+                gen("")
+            # NOTE(review): .base_data looks PyOpenCL-specific; confirm
+            if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
+                args.append("%s.base_data" % arg.name)
+            else:
+                args.append("%s" % arg.name)
+
+            gen("")
+
+            gen("# }}}")
+            gen("")
+
+        gen("# }}}")
+        gen("")
+
+        return args
+
+    # }}}
+
+    def target_specific_preamble(self, gen):
+        """
+        Add the imports the generated C wrapper relies on to the preamble.
+        """
+        gen.add_to_preamble("import numpy as _lpy_np")
+        gen.add_to_preamble("import loopy.target.c.c_execution as _lpy_c")
+
+    def intialize_system_args(self, gen):
+        """
+        Hook for subclasses to initialize default system args; no-op here.
+        """
+        pass
+
+    # {{{ generate invocation
+
+    def generate_invocation(self, gen, kernel_name, args):
+        """Emit a loop that calls ``knl.name(<args>)`` per C kernel."""
+        # NOTE(review): 'kernel_name' does not appear in the emitted code;
+        # the call target is always the literal text 'knl.name'.
+        gen("for knl in _lpy_c_kernels:")
+        with Indentation(gen):
+            gen("knl.name(%s)" % ", ".join(args))
+
+    # }}}
+
+    # {{{ output
+
+    def generate_output_handler(
+            self, gen, options, kernel, implemented_data_info):
+        """
+        Emit the wrapper's return statement: ``None`` followed by the
+        written kernel arguments, as a dict if ``options.return_dict`` is
+        set and as a tuple otherwise.
+        """
+        from loopy.kernel.data import KernelArgument
+
+        if not options.no_numpy:
+            gen("if out_host is None and (_lpy_encountered_numpy "
+                    "and not _lpy_encountered_dev):")
+            with Indentation(gen):
+                gen("out_host = True")
+
+            gen("if out_host:")
+            with Indentation(gen):
+                # placeholder body; no per-argument handling is emitted
+                gen("pass")
+
+            gen("")
+
+        # names of the kernel arguments written by the kernel, in order
+        out_names = [arg.name
+                for arg in implemented_data_info
+                if issubclass(arg.arg_class, KernelArgument)
+                if arg.base_name in kernel.get_written_variables()]
+
+        if options.return_dict:
+            gen("return None, {%s}"
+                    % ", ".join("\"%s\": %s" % (name, name)
+                        for name in out_names))
+        elif out_names:
+            gen("return None, (%s,)" % ", ".join(out_names))
+        else:
+            gen("return None, ()")
+
+    # }}}
+
+    def __call__(self, kernel, codegen_result):
+        """
+        Generates the wrapping python invoker for this execution target
+
+        :arg kernel: the loopy :class:`LoopKernel` to be executed
+        :arg codegen_result: the loopy :class:`CodeGenerationResult` created
+            by code generation
+
+        :returns: py_func, a python function that handles execution of this
+            kernel
+        """
+        options = kernel.options
+        implemented_data_info = codegen_result.implemented_data_info
+        host_code = codegen_result.host_code()
+
+        from loopy.kernel.data import KernelArgument
+        gen = PythonFunctionGenerator(
+                "invoke_%s_loopy_kernel" % kernel.name,
+                self.system_args + [
+                    "%s=None" % idi.name
+                    for idi in implemented_data_info
+                    if issubclass(idi.arg_class, KernelArgument)
+                    ])
+
+        gen.add_to_preamble("from __future__ import division")
+        gen.add_to_preamble("")
+        self.target_specific_preamble(gen)
+        gen.add_to_preamble("")
+        gen.add_to_preamble(host_code)
+        gen.add_to_preamble("")
+
+        self.intialize_system_args(gen)
+
+        self.generate_integer_arg_finding_from_shapes(
+            gen, kernel, implemented_data_info)
+        self.generate_integer_arg_finding_from_offsets(
+            gen, kernel, implemented_data_info)
+        self.generate_integer_arg_finding_from_strides(
+            gen, kernel, implemented_data_info)
+        self.generate_value_arg_check(
+            gen, kernel, implemented_data_info)
+
+        args = self.generate_arg_setup(
+            gen, kernel, implemented_data_info, options)
+
+        self.generate_invocation(gen, codegen_result.host_program.name, args)
+
+        self.generate_output_handler(gen, options, kernel, implemented_data_info)
+
+        if options.write_wrapper:
+            output = gen.get()
+            if options.highlight_wrapper:
+                output = get_highlighted_python_code(output)
+
+            if options.write_wrapper is True:
+                print(output)
+            else:
+                with open(options.write_wrapper, "w") as outf:
+                    outf.write(output)
+
+        return gen.get_function()
+
+
+# }}}
+
+
+class _KernelInfo(ImmutableRecord):  # record type; no fields declared here
+    pass
+
+
+class _Kernels(object):  # no attributes or methods declared here
+    pass
+
+
+# {{{ kernel executor
 
 class KernelExecutorBase(object):
     """An object connecting a kernel to a :class:`pyopencl.Context`
@@ -121,7 +729,7 @@ class KernelExecutorBase(object):
     .. automethod:: __call__
     """
 
-    def __init__(self, kernel):
+    def __init__(self, kernel, invoker=ExecutionWrapperGeneratorBase()):
         """
         :arg kernel: a loopy.LoopKernel
         """
@@ -137,6 +745,8 @@ class KernelExecutorBase(object):
                 arg.dtype is None
                 for arg in kernel.args)
 
+        self.invoker = invoker
+
     @memoize_method
     def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
         kernel = self.kernel
@@ -195,6 +805,99 @@ class KernelExecutorBase(object):
 
         return frozenset(six.iteritems(arg_to_dtype))
 
+    # {{{ debugging aids
+
+    def get_highlighted_code(self, arg_to_dtype=None):
+        code = self.get_code(arg_to_dtype)  # device code, see get_code
+        return get_highlighted_code(code)
+
+    def get_code(self, arg_to_dtype=None):
+        """Return the device code generated for the typed kernel."""
+        dtype_set = arg_to_dtype
+        if dtype_set is not None:
+            dtype_set = frozenset(six.iteritems(dtype_set))
+        typed_knl = self.get_typed_and_scheduled_kernel(dtype_set)
+
+        from loopy.codegen import generate_code_v2
+        return generate_code_v2(typed_knl).device_code()
+
+    # }}}
+
+    # {{{ call and info generator
+
+    @memoize_method
+    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+        raise NotImplementedError()  # not implemented in this base class
+
+    def __call__(self, queue, **kwargs):
+        """
+        :arg allocator: a callable passed a byte count and returning
+            a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator
+            maybe.
+        :arg wait_for: A list of :class:`pyopencl.Event` instances
+            for which to wait.
+        :arg out_host: :class:`bool`
+            Decides whether output arguments (i.e. arguments
+            written by the kernel) are to be returned as
+            :mod:`numpy` arrays. *True* for yes, *False* for no.
+
+            For the default value of *None*, if all (input) array
+            arguments are :mod:`numpy` arrays, defaults to
+            returning :mod:`numpy` arrays as well.
+
+        :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event`
+            associated with the execution of the kernel, and
+            output is a tuple of output arguments (arguments that
+            are written as part of the kernel). The order is given
+            by the order of kernel arguments. If this order is unspecified
+            (such as when kernel arguments are inferred automatically),
+            enable :attr:`loopy.Options.return_dict` to make *output* a
+            :class:`dict` instead, with keys of argument names and values
+            of the returned arrays.
+        """
+        # executor-level options are split off before argument unpacking
+        allocator = kwargs.pop("allocator", None)
+        wait_for = kwargs.pop("wait_for", None)
+        out_host = kwargs.pop("out_host", None)
+
+        kwargs = self.packing_controller.unpack(kwargs)
+        # NOTE(review): calls cl_kernel_info, while this class defines kernel_info
+        kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs))
+
+        return kernel_info.invoker(
+                kernel_info.cl_kernels, queue, allocator, wait_for,
+                out_host, **kwargs)
+
+    # }}}
+
+# }}}
+
+# {{{ code highlighters
+
+
+def get_highlighted_python_code(text):
+    """Return *text* highlighted as Python, or unchanged without pygments."""
+    try:
+        from pygments import highlight
+    except ImportError:
+        return text
+
+    from pygments.lexers import PythonLexer
+    from pygments.formatters import TerminalFormatter
+    return highlight(text, PythonLexer(), TerminalFormatter())
+
+
+def get_highlighted_code(text):
+    """Return *text* highlighted as C, or unchanged without pygments."""
+    try:
+        from pygments import highlight
+    except ImportError:
+        return text
+
+    from pygments.lexers import CLexer
+    from pygments.formatters import TerminalFormatter
+    return highlight(text, CLexer(), TerminalFormatter())
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index e4835a363..85d751260 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -260,8 +260,9 @@ class CTarget(TargetBase):
     hash_fields = TargetBase.hash_fields + ("fortran_abi",)
     comparison_fields = TargetBase.comparison_fields + ("fortran_abi",)
 
-    def __init__(self, fortran_abi=False):
+    def __init__(self, fortran_abi=False, compiler=None):
         self.fortran_abi = fortran_abi
+        self.compiler = compiler
         super(CTarget, self).__init__()
 
     def split_kernel_at_global_barriers(self):
@@ -298,6 +299,13 @@ class CTarget(TargetBase):
         # These kind of shouldn't be here.
         return self.get_dtype_registry().dtype_to_ctype(dtype)
 
+    def get_kernel_executor_cache_key(self, *args, **kwargs):
+        return self.compiler  # the key is the compiler; arguments are ignored
+
+    def get_kernel_executor(self, knl, *args, **kwargs):  # extra args ignored
+        from loopy.target.c.c_execution import CKernelExecutor
+        return CKernelExecutor(knl, self.compiler)
+
     # }}}
 
 
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
new file mode 100644
index 000000000..230777529
--- /dev/null
+++ b/loopy/target/c/c_execution.py
@@ -0,0 +1,265 @@
+from __future__ import division, with_statement, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import tempfile
+import cgen
+import os
+import subprocess
+
+from loopy.target.c import CTarget, generate_header
+from loopy.codegen import generate_code
+from loopy.execution import (KernelExecutorBase, _Kernels,
+                             _KernelInfo, ExecutionWrapperGeneratorBase)
+from pytools import memoize_method
+import weakref
+
+import ctypes
+
+import logging
+logger = logging.getLogger(__name__)
+
+"""
+The compiler module handles invocation of compilers to generate a shared lib
+which can be loaded via ctypes.
+"""
+
+
+class CCompiler(object):
+    """Wraps a C compiler to build and load shared libraries.
+    Defaults to gcc; subclasses override the class attributes below.
+    """
+
+    source_suffix = 'c'
+    default_exe = 'gcc'
+    default_compile_flags = '-std=c99 -g -O3 -fPIC'.split()
+    default_link_flags = '-shared'.split()
+
+    def __init__(self, cc=None, cflags=None, ldflags=None):
+        self.exe = cc if cc else self.default_exe
+        self.cflags = cflags or self.default_compile_flags[:]
+        self.ldflags = ldflags or self.default_link_flags[:]
+        self.tempdir = tempfile.TemporaryDirectory()
+        self._build_count = 0  # numbers the files of successive build() calls
+
+    def _tempname(self, name):
+        """Build temporary filename path in tempdir."""
+        return os.path.join(self.tempdir.name, name)
+
+    def _call(self, args, **kwargs):
+        """Invoke compiler with arguments."""
+        cwd = self.tempdir.name
+        args_ = [self.exe] + args
+        logger.debug(args_)
+        subprocess.check_call(args_, cwd=cwd, **kwargs)
+
+    def build(self, code):
+        """Compile code, build and load shared library."""
+        logger.debug(code)
+        # Every build gets fresh file names: loading a rebuilt library from
+        # an already-loaded path can hand back the old, stale library.
+        self._build_count += 1
+        stem = self._tempname('code%d' % self._build_count)
+        c_fname = stem + '.' + self.source_suffix
+        with open(c_fname, 'w') as fd:
+            fd.write(code)
+        self._call(self.compile_args(c_fname))
+        self._call(self.link_args(stem + '.o', stem + '.so'))
+        return ctypes.CDLL(stem + '.so')
+
+    def compile_args(self, c_fname):
+        "Construct args for compile command."
+        return self.cflags + ['-c', c_fname]
+
+    def link_args(self, obj_fname, dll_fname):
+        "Construct args for link command."
+        return self.ldflags + ['-shared', obj_fname, '-o', dll_fname]
+
+
+class CppCompiler(CCompiler):
+    """Subclass of CCompiler to invoke a C++ compiler.
+       Defaults to g++"""
+    source_suffix = 'cpp'
+    default_exe = 'g++'
+    default_compile_flags = '-g -O3 -fPIC'.split()  # PIC: linked with -shared
+
+
+class CompiledKernel(object):
+    """
+    A CompiledKernel wraps a loopy kernel, compiling it and loading the
+    result as a shared library, and provides access to the kernel as a
+    ctypes function object, wrapped by the __call__ method, which attempts
+    to automatically map argument types.
+    """
+
+    def __init__(self, knl, comp=None):
+        assert isinstance(knl.target, CTarget)
+        self.knl = knl
+        self.code, _ = generate_code(knl)
+        self.comp = comp or CCompiler()
+        self.dll = self.comp.build(self.code)
+        self.func_decl, = generate_header(knl)
+        self._arg_info = []  # (name, ctypes type) pairs, in declaration order
+        # TODO knl.args[:].dtype is sufficient
+        self._visit_func_decl(self.func_decl)
+        self.name = self.knl.name
+        restype = self.func_decl.subdecl.typename
+        if restype == 'void':
+            self.restype = None
+        else:
+            raise ValueError('Unhandled restype %r' % (restype, ))
+        self._fn = getattr(self.dll, self.name)
+        self._fn.restype = self.restype
+        self._fn.argtypes = [ctype for name, ctype in self._arg_info]
+        self._prepared_call_cache = weakref.WeakKeyDictionary()
+
+    def __call__(self, **kwargs):
+        """Execute kernel with given args mapped to ctypes equivalents."""
+        args_ = []
+        for knl_arg, arg_t in zip(self.knl.args, self._fn.argtypes):
+            arg = kwargs[knl_arg.name]
+            if hasattr(arg, 'ctypes'):
+                if arg.size == 0:
+                    # TODO eliminate unused arguments from kernel
+                    arg_ = arg_t()  # NULL/zero; arg_t(0.0) fails for pointers
+                else:
+                    arg_ = arg.ctypes.data_as(arg_t)
+            else:
+                arg_ = arg_t(arg)
+            args_.append(arg_)
+        self._fn(*args_)
+
+    def _append_arg(self, name, dtype, pointer=False):
+        """Append arg info to current argument list."""
+        self._arg_info.append((
+            name,
+            self._dtype_to_ctype(dtype, pointer=pointer)
+        ))
+
+    def _visit_const(self, node):
+        """Visit const arg of kernel (a const pointer or a const scalar)."""
+        if isinstance(node.subdecl, cgen.RestrictPointer):
+            self._visit_pointer(node.subdecl)
+        else:
+            pod = node.subdecl  # type: cgen.POD
+            self._append_arg(pod.name, pod.dtype)
+
+    def _visit_pointer(self, node):
+        "Visit pointer argument of kernel."
+        pod = node.subdecl  # type: cgen.POD
+        self._append_arg(pod.name, pod.dtype, pointer=True)
+
+    def _visit_func_decl(self, func_decl):
+        """Visit nodes of function declaration of kernel."""
+        for arg in func_decl.arg_decls:
+            if isinstance(arg, cgen.Const):
+                self._visit_const(arg)
+            elif isinstance(arg, cgen.RestrictPointer):
+                self._visit_pointer(arg)
+            else:
+                raise ValueError('unhandled type for arg %r' % (arg, ))
+
+    def _dtype_to_ctype(self, dtype, pointer=False):
+        """Map NumPy dtype to equivalent ctypes type."""
+        target = self.knl.target  # type: CTarget
+        registry = target.get_dtype_registry().wrapped_registry
+        typename = registry.dtype_to_ctype(dtype)
+        typename = {'unsigned': 'uint'}.get(typename, typename)
+        basetype = getattr(ctypes, 'c_' + typename)
+        if pointer:
+            return ctypes.POINTER(basetype)
+        return basetype
+
+
+class CKernelExecutor(KernelExecutorBase):
+    """An object connecting a kernel to a :class:`CompiledKernel`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, kernel, compiler=None):
+        """
+        :arg kernel: a loopy.LoopKernel or a generator returning kernels;
+            it is handed to :class:`KernelExecutorBase` unchanged.
+        :arg compiler: the compiler object given to each
+            :class:`CompiledKernel`; a default :class:`CCompiler` if not given.
+        """
+
+        self.compiler = compiler if compiler else CCompiler()
+        super(CKernelExecutor, self).__init__(kernel)
+
+    @memoize_method
+    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
+        # generate device code, optionally dump/edit it, then wrap each program
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+        dev_code = codegen_result.device_code()
+        # NOTE(review): get_highlighted_code must exist as a method -- confirm
+        if self.kernel.options.write_cl:
+            output = dev_code
+            if self.kernel.options.highlight_cl:
+                output = self.get_highlighted_code(output)
+
+            if self.kernel.options.write_cl is True:
+                print(output)
+            else:
+                with open(self.kernel.options.write_cl, "w") as outf:
+                    outf.write(output)
+        # NOTE(review): the edited text is not read again in this method
+        if self.kernel.options.edit_cl:
+            from pytools import invoke_editor
+            dev_code = invoke_editor(dev_code, "code.cl")
+
+        c_kernels = _Kernels()  # one attribute per device program, by name
+        for dp in codegen_result.device_programs:
+            setattr(c_kernels, dp.name, CompiledKernel(dp, self.compiler))
+
+        return _KernelInfo(
+                kernel=kernel,
+                c_kernels=c_kernels,
+                implemented_data_info=codegen_result.implemented_data_info,
+                invoker=self.invoker(kernel, codegen_result))
+
+    # }}}
+
+    def __call__(self, **kwargs):
+        """
+        :returns: ``(None, output)`` the output is a tuple of output arguments
+            (arguments that are written as part of the kernel). The order is given
+            by the order of kernel arguments. If this order is unspecified
+            (such as when kernel arguments are inferred automatically),
+            enable :attr:`loopy.Options.return_dict` to make *output* a
+            :class:`dict` instead, with keys of argument names and values
+            of the returned arrays.
+        """
+
+        kwargs = self.packing_controller.unpack(kwargs)
+        # kernel_info is memoized per set of argument dtypes
+        kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs))
+
+        return kernel_info.invoker(
+                kernel_info.c_kernels, **kwargs)
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index a8f47adb9..a2574bf8a 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -22,17 +22,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import six
 from six.moves import range, zip
 
 from pytools import ImmutableRecord, memoize_method
-from loopy.diagnostic import ParameterFinderWarning
-from pytools.py_codegen import (
-        Indentation, PythonFunctionGenerator)
-from loopy.diagnostic import LoopyError
-from loopy.types import NumpyType
-from loopy.execution import KernelExecutorBase
-
+from pytools.py_codegen import Indentation
+from loopy.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase,
+                             _KernelInfo, _Kernels)
 import logging
 logger = logging.getLogger(__name__)
 
@@ -43,507 +38,123 @@ logger = logging.getLogger(__name__)
 # Prefix all auxiliary variables with "_lpy".
 
 
-def python_dtype_str(dtype):
-    import pyopencl.tools as cl_tools
-    if dtype.isbuiltin:
-        return "_lpy_np."+dtype.name
-    else:
-        return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
-                % cl_tools.dtype_to_ctype(dtype))
-
-
-# {{{ integer arg finding from shapes
-
-def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info):
-    # a mapping from integer argument names to a list of tuples
-    # (arg_name, expression), where expression is a
-    # unary function of kernel.arg_dict[arg_name]
-    # returning the desired integer argument.
-    iarg_to_sources = {}
-
-    from loopy.kernel.data import GlobalArg
-    from loopy.symbolic import DependencyMapper, StringifyMapper
-    dep_map = DependencyMapper()
-
-    from pymbolic import var
-    for arg in implemented_data_info:
-        if arg.arg_class is GlobalArg:
-            sym_shape = var(arg.name).attr("shape")
-            for axis_nr, shape_i in enumerate(arg.shape):
-                if shape_i is None:
-                    continue
-
-                deps = dep_map(shape_i)
-
-                if len(deps) == 1:
-                    integer_arg_var, = deps
-
-                    if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
-                        from pymbolic.algorithm import solve_affine_equations_for
-                        try:
-                            # friggin' overkill :)
-                            iarg_expr = solve_affine_equations_for(
-                                    [integer_arg_var.name],
-                                    [(shape_i, sym_shape.index(axis_nr))]
-                                    )[integer_arg_var]
-                        except Exception as e:
-                            #from traceback import print_exc
-                            #print_exc()
-
-                            # went wrong? oh well
-                            from warnings import warn
-                            warn("Unable to generate code to automatically "
-                                    "find '%s' from the shape of '%s':\n%s"
-                                    % (integer_arg_var.name, arg.name, str(e)),
-                                    ParameterFinderWarning)
-                        else:
-                            iarg_to_sources.setdefault(integer_arg_var.name, []) \
-                                    .append((arg.name, iarg_expr))
-
-    gen("# {{{ find integer arguments from shapes")
-    gen("")
-
-    for iarg_name, sources in six.iteritems(iarg_to_sources):
-        gen("if %s is None:" % iarg_name)
-        with Indentation(gen):
-            if_stmt = "if"
-            for arg_name, value_expr in sources:
-                gen("%s %s is not None:" % (if_stmt, arg_name))
-                with Indentation(gen):
-                    gen("%s = %s"
-                            % (iarg_name, StringifyMapper()(value_expr)))
-
-                if_stmt = "elif"
-
-        gen("")
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ integer arg finding from offsets
-
-def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info):
-    options = kernel.options
-
-    gen("# {{{ find integer arguments from offsets")
-    gen("")
-
-    for arg in implemented_data_info:
-        impl_array_name = arg.offset_for_name
-        if impl_array_name is not None:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("if %s is None:" % impl_array_name)
-                with Indentation(gen):
-                    gen("# Output variable, we'll be allocating "
-                            "it, with zero offset.")
-                    gen("%s = 0" % arg.name)
-                gen("else:")
-                with Indentation(gen):
-                    if not options.no_numpy:
-                        gen("_lpy_offset = getattr(%s, \"offset\", 0)"
-                                % impl_array_name)
-                    else:
-                        gen("_lpy_offset = %s.offset" % impl_array_name)
-
-                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                    if not options.skip_arg_checks:
-                        gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-                        gen("assert _lpy_remdr == 0, \"Offset of array '%s' is "
-                                "not divisible by its dtype itemsize\""
-                                % impl_array_name)
-                        gen("del _lpy_remdr")
-                    else:
-                        gen("%s = _lpy_offset // %d"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-                    if not options.skip_arg_checks:
-                        gen("del _lpy_offset")
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ integer arg finding from strides
-
-def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info):
-    options = kernel.options
-
-    gen("# {{{ find integer arguments from strides")
-    gen("")
-
-    for arg in implemented_data_info:
-        if arg.stride_for_name_and_axis is not None:
-            impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis
-
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                if not options.skip_arg_checks:
-                    gen("if %s is None:" % impl_array_name)
-                    with Indentation(gen):
-                        gen("raise RuntimeError(\"required stride '%s' for "
-                                "argument '%s' not given or deducible from "
-                                "passed array\")"
-                                % (arg.name, impl_array_name))
-
-                    base_arg = kernel.impl_arg_to_arg[impl_array_name]
-
-                    if not options.skip_arg_checks:
-                        gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
-                                % (arg.name, impl_array_name, stride_impl_axis,
-                                    base_arg.dtype.dtype.itemsize))
-
-                        gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is "
-                                "not divisible by its dtype itemsize\""
-                                % (stride_impl_axis, impl_array_name))
-                        gen("del _lpy_remdr")
-                    else:
-                        gen("%s = _lpy_offset // %d"
-                                % (arg.name, base_arg.dtype.itemsize))
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ check that value args are present
-
-def generate_value_arg_check(gen, kernel, implemented_data_info):
-    if kernel.options.skip_arg_checks:
-        return
+class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
+    """
+    Specialized form of the :class:`ExecutionWrapperGeneratorBase` for
+    pyopencl execution
+    """
 
-    from loopy.kernel.data import ValueArg
+    def __init__(self):
+        # These names open the generated invoker's parameter list (presumably
+        # ahead of the kernel arguments); out_host is ignored if
+        # options.no_numpy.
+        super(PyOpenCLExecutionWrapperGenerator, self).__init__([
+            "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
+            "out_host=None"])
 
-    gen("# {{{ check that value args are present")
-    gen("")
+    def python_dtype_str(self, dtype):
+        # imported up front, exactly as before, even if the builtin path wins
+        from pyopencl.tools import dtype_to_ctype
+        if dtype.isbuiltin:
+            return "_lpy_np."+dtype.name
+        ctype_name = dtype_to_ctype(dtype)
+        return "_lpy_cl_tools.get_or_register_dtype(\"%s\")" % ctype_name
 
-    for arg in implemented_data_info:
-        if not issubclass(arg.arg_class, ValueArg):
-            continue
+    # {{{ handle non-numpy args
 
-        gen("if %s is None:" % arg.name)
+    def handle_non_numpy_arg(self, gen, arg):
+        gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
         with Indentation(gen):
-            gen("raise TypeError(\"value argument '%s' "
-                    "was not given and could not be automatically "
-                    "determined\")" % arg.name)
-
-    gen("# }}}")
-    gen("")
-
-# }}}
-
-
-# {{{ arg setup
-
-def generate_arg_setup(gen, kernel, implemented_data_info, options):
-    import loopy as lp
-
-    from loopy.kernel.data import KernelArgument
-    from loopy.kernel.array import ArrayBase
-    from loopy.symbolic import StringifyMapper
-    from pymbolic import var
-
-    gen("# {{{ set up array arguments")
-    gen("")
-
-    if not options.no_numpy:
-        gen("_lpy_encountered_numpy = False")
-        gen("_lpy_encountered_dev = False")
-        gen("")
-
-    args = []
-
-    strify = StringifyMapper()
-
-    expect_no_more_arguments = False
-
-    for arg_idx, arg in enumerate(implemented_data_info):
-        is_written = arg.base_name in kernel.get_written_variables()
-        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
-
-        if not issubclass(arg.arg_class, KernelArgument):
-            expect_no_more_arguments = True
-            continue
-
-        if expect_no_more_arguments:
-            raise LoopyError("Further arguments encountered after arg info "
-                    "describing a global temporary variable")
-
-        if not issubclass(arg.arg_class, ArrayBase):
-            args.append(arg.name)
-            continue
+            gen("# synchronous, nothing to worry about")
+            gen("%s = _lpy_cl_array.to_device("
+                    "queue, %s, allocator=allocator)"
+                    % (arg.name, arg.name))
+            gen("_lpy_encountered_numpy = True")
+        gen("elif %s is not None:" % arg.name)
+        with Indentation(gen):
+            gen("_lpy_encountered_dev = True")
 
-        gen("# {{{ process %s" % arg.name)
         gen("")
 
-        if not options.no_numpy:
-            gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
-            with Indentation(gen):
-                gen("# synchronous, nothing to worry about")
-                gen("%s = _lpy_cl_array.to_device("
-                        "queue, %s, allocator=allocator)"
-                        % (arg.name, arg.name))
-                gen("_lpy_encountered_numpy = True")
-            gen("elif %s is not None:" % arg.name)
-            with Indentation(gen):
-                gen("_lpy_encountered_dev = True")
-
-            gen("")
-
-        if not options.skip_arg_checks and not is_written:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"input argument '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if (is_written
-                and arg.arg_class is lp.ImageArg
-                and not options.skip_arg_checks):
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"written image '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if is_written and arg.shape is None and not options.skip_arg_checks:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"written argument '%s' has "
-                        "unknown shape and must be supplied\")" % arg.name)
-                gen("")
-
-        possibly_made_by_loopy = False
-
-        # {{{ allocate written arrays, if needed
+    # {{{ handle allocation of unspecified arguments
 
-        if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
-                and arg.shape is not None:
-
-            if not isinstance(arg.dtype, NumpyType):
-                raise LoopyError("do not know how to pass arg of type '%s'"
-                        % arg.dtype)
-
-            possibly_made_by_loopy = True
-            gen("_lpy_made_by_loopy = False")
-            gen("")
-
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                num_axes = len(arg.strides)
-                for i in range(num_axes):
-                    gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
-
-                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                for i in range(num_axes):
-                    gen("_lpy_strides_%d = %s" % (i, strify(
-                        itemsize*arg.unvec_strides[i])))
-
-                if not options.skip_arg_checks:
-                    for i in range(num_axes):
-                        gen("assert _lpy_strides_%d > 0, "
-                                "\"'%s' has negative stride in axis %d\""
-                                % (i, arg.name, i))
-
-                sym_strides = tuple(
-                        var("_lpy_strides_%d" % i)
-                        for i in range(num_axes))
-                sym_shape = tuple(
-                        var("_lpy_shape_%d" % i)
-                        for i in range(num_axes))
-
-                alloc_size_expr = (sum(astrd*(alen-1)
-                    for alen, astrd in zip(sym_shape, sym_strides))
-                    + itemsize)
-
-                gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
-                gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
-                        "%(dtype)s, strides=%(strides)s, "
-                        "data=allocator(_lpy_alloc_size), allocator=allocator)"
-                        % dict(
-                            name=arg.name,
-                            shape=strify(sym_shape),
-                            strides=strify(sym_strides),
-                            dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))
-
-                if not options.skip_arg_checks:
-                    for i in range(num_axes):
-                        gen("del _lpy_shape_%d" % i)
-                        gen("del _lpy_strides_%d" % i)
-                    gen("del _lpy_alloc_size")
-                    gen("")
-
-                gen("_lpy_made_by_loopy = True")
-                gen("")
-
-        # }}}
-
-        # {{{ argument checking
-
-        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
-                and not options.skip_arg_checks:
-            if possibly_made_by_loopy:
-                gen("if not _lpy_made_by_loopy:")
-            else:
-                gen("if True:")
-
-            with Indentation(gen):
-                gen("if %s.dtype != %s:"
-                        % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
-                with Indentation(gen):
-                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
-                            "(got: %%s, expected: %s)\" %% %s.dtype)"
-                            % (arg.name, arg.dtype, arg.name))
-
-                # {{{ generate shape checking code
-
-                def strify_allowing_none(shape_axis):
-                    if shape_axis is None:
-                        return "None"
-                    else:
-                        return strify(shape_axis)
-
-                def strify_tuple(t):
-                    if len(t) == 0:
-                        return "()"
-                    else:
-                        return "(%s,)" % ", ".join(
-                                strify_allowing_none(sa)
-                                for sa in t)
-
-                shape_mismatch_msg = (
-                        "raise TypeError(\"shape mismatch on argument '%s' "
-                        "(got: %%s, expected: %%s)\" "
-                        "%% (%s.shape, %s))"
-                        % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
-
-                if kernel_arg.shape is None:
-                    pass
-
-                elif any(shape_axis is None for shape_axis in kernel_arg.shape):
-                    gen("if len(%s.shape) != %s:"
-                            % (arg.name, len(arg.unvec_shape)))
-                    with Indentation(gen):
-                        gen(shape_mismatch_msg)
-
-                    for i, shape_axis in enumerate(arg.unvec_shape):
-                        if shape_axis is None:
-                            continue
-
-                        gen("if %s.shape[%d] != %s:"
-                                % (arg.name, i, strify(shape_axis)))
-                        with Indentation(gen):
-                            gen(shape_mismatch_msg)
-
-                else:  # not None, no Nones in tuple
-                    gen("if %s.shape != %s:"
-                            % (arg.name, strify(arg.unvec_shape)))
-                    with Indentation(gen):
-                        gen(shape_mismatch_msg)
-
-                # }}}
-
-                if arg.unvec_strides and kernel_arg.dim_tags:
-                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                    sym_strides = tuple(
-                            itemsize*s_i for s_i in arg.unvec_strides)
-                    gen("if %s.strides != %s:"
-                            % (arg.name, strify(sym_strides)))
-                    with Indentation(gen):
-                        gen("raise TypeError(\"strides mismatch on "
-                                "argument '%s' (got: %%s, expected: %%s)\" "
-                                "%% (%s.strides, %s))"
-                                % (arg.name, arg.name, strify(sym_strides)))
-
-                if not arg.allows_offset:
-                    gen("if %s.offset:" % arg.name)
-                    with Indentation(gen):
-                        gen("raise ValueError(\"Argument '%s' does not "
-                                "allow arrays with offsets. Try passing "
-                                "default_offset=loopy.auto to make_kernel()."
-                                "\")" % arg.name)
-                        gen("")
-
-        # }}}
-
-        if possibly_made_by_loopy and not options.skip_arg_checks:
-            gen("del _lpy_made_by_loopy")
+    def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks):
+        """Generate the code allocating a non-specified argument for pyopencl.
+        Emits, via *gen*, per-axis shape/stride temporaries, the byte size,
+        and a ``_lpy_cl_array.Array`` bound to ``arg.name``."""
+        from pymbolic import var
+
+        num_axes = len(arg.strides)
+        for i in range(num_axes):
+            gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
+        # the emitted strides are scaled by the dtype's itemsize
+        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+        for i in range(num_axes):
+            gen("_lpy_strides_%d = %s" % (i, strify(
+                itemsize*arg.unvec_strides[i])))
+
+        if not skip_arg_checks:
+            for i in range(num_axes):
+                gen("assert _lpy_strides_%d > 0, "
+                        "\"'%s' has negative stride in axis %d\""
+                        % (i, arg.name, i))
+
+        sym_strides = tuple(
+                var("_lpy_strides_%d" % i)
+                for i in range(num_axes))
+        sym_shape = tuple(
+                var("_lpy_shape_%d" % i)
+                for i in range(num_axes))
+        # size: sum of stride*(length-1) over all axes, plus one item
+        alloc_size_expr = (sum(astrd*(alen-1)
+            for alen, astrd in zip(sym_shape, sym_strides))
+            + itemsize)
+
+        gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
+        gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
+                "%(dtype)s, strides=%(strides)s, "
+                "data=allocator(_lpy_alloc_size), allocator=allocator)"
+                % dict(
+                    name=arg.name,
+                    shape=strify(sym_shape),
+                    strides=strify(sym_strides),
+                    dtype=self.python_dtype_str(kernel_arg.dtype.numpy_dtype)))
+        # the temporaries are only deleted again when arg checks are enabled
+        if not skip_arg_checks:
+            for i in range(num_axes):
+                gen("del _lpy_shape_%d" % i)
+                gen("del _lpy_strides_%d" % i)
+            gen("del _lpy_alloc_size")
             gen("")
 
-        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-            args.append("%s.base_data" % arg.name)
-        else:
-            args.append("%s" % arg.name)
+    # }}}
 
-        gen("")
+    def target_specific_preamble(self, gen):
+        """Add the default numpy/pyopencl imports to the preamble of *gen*."""
+        for import_stmt in (
+                "import numpy as _lpy_np",
+                "import pyopencl as _lpy_cl",
+                "import pyopencl.array as _lpy_cl_array",
+                "import pyopencl.tools as _lpy_cl_tools"):
+            gen.add_to_preamble(import_stmt)
 
-        gen("# }}}")
+    def initialize_system_args(self, gen):
+        """
+        Emit code defaulting *allocator* to a DeferredAllocator when it is None
+        """
+        gen("if allocator is None:")
+        with Indentation(gen):
+            gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
         gen("")
 
-    gen("# }}}")
-    gen("")
-
-    return args
-
-# }}}
-
-
-def generate_invoker(kernel, codegen_result):
-    options = kernel.options
-    implemented_data_info = codegen_result.implemented_data_info
-    host_code = codegen_result.host_code()
-
-    system_args = [
-            "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None",
-            # ignored if options.no_numpy
-            "out_host=None"
-            ]
-
-    from loopy.kernel.data import KernelArgument
-    gen = PythonFunctionGenerator(
-            "invoke_%s_loopy_kernel" % kernel.name,
-            system_args + [
-                "%s=None" % idi.name
-                for idi in implemented_data_info
-                if issubclass(idi.arg_class, KernelArgument)
-                ])
-
-    gen.add_to_preamble("from __future__ import division")
-    gen.add_to_preamble("")
-    gen.add_to_preamble("import pyopencl as _lpy_cl")
-    gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
-    gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
-    gen.add_to_preamble("import numpy as _lpy_np")
-    gen.add_to_preamble("")
-    gen.add_to_preamble(host_code)
-    gen.add_to_preamble("")
-
-    gen("if allocator is None:")
-    with Indentation(gen):
-        gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)")
-    gen("")
-
-    generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info)
-    generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info)
-    generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info)
-    generate_value_arg_check(gen, kernel, implemented_data_info)
-
-    args = generate_arg_setup(gen, kernel, implemented_data_info, options)
-
     # {{{ generate invocation
 
-    gen("_lpy_evt = {kernel_name}({args})"
+    def generate_invocation(self, gen, kernel_name, args):
+        gen("for knl in _lpy_cl_kernels:")
+        with Indentation(gen):
+            gen("_lpy_evt = {kernel_name}({args})"
             .format(
-                kernel_name=codegen_result.host_program.name,
+                kernel_name=kernel_name,
                 args=", ".join(
                     ["_lpy_cl_kernels", "queue"]
                     + args
@@ -551,72 +162,72 @@ def generate_invoker(kernel, codegen_result):
 
     # }}}
 
     # {{{ output
 
-    if not options.no_numpy:
-        gen("if out_host is None and (_lpy_encountered_numpy "
-                "and not _lpy_encountered_dev):")
-        with Indentation(gen):
-            gen("out_host = True")
+    def generate_output_handler(
+            self, gen, options, kernel, implemented_data_info):
+        """Emit the wrapper's output handling and its ``return`` statement.
+
+        Unless *options.no_numpy* is set, the generated code calls
+        ``.get(queue=queue)`` on every written kernel argument when
+        ``out_host`` is true. The generated wrapper then returns
+        ``(_lpy_evt, outputs)``, where ``outputs`` is a dict keyed by
+        argument name if *options.return_dict* is set, and a tuple otherwise.
+
+        :arg gen: the generator that the wrapper code is emitted to
+        :arg options: the options object; *no_numpy* and *return_dict*
+            are consulted
+        :arg kernel: the kernel, queried for its written variables
+        :arg implemented_data_info: argument descriptors; only entries
+            whose *arg_class* is a :class:`KernelArgument` are considered
+        """
 
-        gen("if out_host:")
-        with Indentation(gen):
-            gen("pass")  # if no outputs (?!)
-            for arg in implemented_data_info:
-                if not issubclass(arg.arg_class, KernelArgument):
-                    continue
+        from loopy.kernel.data import KernelArgument
 
-                is_written = arg.base_name in kernel.get_written_variables()
-                if is_written:
-                    gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
-
-        gen("")
-
-    if options.return_dict:
-        gen("return _lpy_evt, {%s}"
-                % ", ".join("\"%s\": %s" % (arg.name, arg.name)
-                    for arg in implemented_data_info
-                    if issubclass(arg.arg_class, KernelArgument)
-                    if arg.base_name in kernel.get_written_variables()))
-    else:
-        out_args = [arg
-                for arg in implemented_data_info
-                    if issubclass(arg.arg_class, KernelArgument)
-                if arg.base_name in kernel.get_written_variables()]
-        if out_args:
-            gen("return _lpy_evt, (%s,)"
-                    % ", ".join(arg.name for arg in out_args))
-        else:
-            gen("return _lpy_evt, ()")
+        if not options.no_numpy:
+            gen("if out_host is None and (_lpy_encountered_numpy "
+                    "and not _lpy_encountered_dev):")
+            with Indentation(gen):
+                gen("out_host = True")
 
-    # }}}
+            gen("if out_host:")
+            with Indentation(gen):
+                gen("pass")  # if no outputs (?!)
+                for arg in implemented_data_info:
+                    if not issubclass(arg.arg_class, KernelArgument):
+                        continue
 
-    if options.write_wrapper:
-        output = gen.get()
-        if options.highlight_wrapper:
-            output = get_highlighted_python_code(output)
+                    # copy only arguments the kernel writes back to the host
+                    is_written = arg.base_name in kernel.get_written_variables()
+                    if is_written:
+                        gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
+
+            gen("")
 
-        if options.write_wrapper is True:
-            print(output)
+        if options.return_dict:
+            gen("return _lpy_evt, {%s}"
+                    % ", ".join("\"%s\": %s" % (arg.name, arg.name)
+                        for arg in implemented_data_info
+                        if issubclass(arg.arg_class, KernelArgument)
+                        if arg.base_name in kernel.get_written_variables()))
         else:
-            with open(options.write_wrapper, "w") as outf:
-                outf.write(output)
-
-    return gen.get_function()
+            out_args = [arg
+                    for arg in implemented_data_info
+                        if issubclass(arg.arg_class, KernelArgument)
+                    if arg.base_name in kernel.get_written_variables()]
+            if out_args:
+                gen("return _lpy_evt, (%s,)"
+                        % ", ".join(arg.name for arg in out_args))
+            else:
+                gen("return _lpy_evt, ()")
 
+    # }}}
 
 # }}}
 
 
 # {{{ kernel executor
 
-class _CLKernelInfo(ImmutableRecord):
-    pass
-
-
-class _CLKernels(object):
-    pass
-
 
 class PyOpenCLKernelExecutor(KernelExecutorBase):
     """An object connecting a kernel to a :class:`pyopencl.Context`
@@ -635,7 +226,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
             specific arguments.
         """
 
-        super(PyOpenCLKernelExecutor, self).__init__(kernel)
+        super(PyOpenCLKernelExecutor, self).__init__(
+            kernel, invoker=PyOpenCLExecutionWrapperGenerator())
 
         self.context = context
 
@@ -644,10 +236,11 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
             self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
 
     @memoize_method
-    def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code_v2
+        from loopy.execution import get_highlighted_code
         codegen_result = generate_code_v2(kernel)
 
         dev_code = codegen_result.device_code()
@@ -655,7 +248,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         if self.kernel.options.write_cl:
             output = dev_code
             if self.kernel.options.highlight_cl:
-                output = get_highlighted_cl_code(output)
+                output = get_highlighted_code(output)
 
             if self.kernel.options.write_cl is True:
                 print(output)
@@ -673,33 +266,15 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
                 cl.Program(self.context, dev_code)
                 .build(options=kernel.options.cl_build_options))
 
-        cl_kernels = _CLKernels()
+        cl_kernels = _Kernels()
         for dp in codegen_result.device_programs:
             setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
 
-        return _CLKernelInfo(
+        return _KernelInfo(
                 kernel=kernel,
                 cl_kernels=cl_kernels,
                 implemented_data_info=codegen_result.implemented_data_info,
-                invoker=generate_invoker(kernel, codegen_result))
-
-    # {{{ debugging aids
-
-    def get_code(self, arg_to_dtype=None):
-        if arg_to_dtype is not None:
-            arg_to_dtype = frozenset(six.iteritems(arg_to_dtype))
-
-        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
-
-        from loopy.codegen import generate_code_v2
-        code = generate_code_v2(kernel)
-        return code.device_code()
-
-    def get_highlighted_code(self, arg_to_dtype=None):
-        return get_highlighted_cl_code(
-                self.get_code(arg_to_dtype))
-
-    # }}}
+                invoker=self.invoker(kernel, codegen_result))
 
     def __call__(self, queue, **kwargs):
         """
@@ -742,29 +317,4 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
 # }}}
 
-
-def get_highlighted_python_code(text):
-    try:
-        from pygments import highlight
-    except ImportError:
-        return text
-    else:
-        from pygments.lexers import PythonLexer
-        from pygments.formatters import TerminalFormatter
-
-        return highlight(text, PythonLexer(), TerminalFormatter())
-
-
-def get_highlighted_cl_code(text):
-    try:
-        from pygments import highlight
-    except ImportError:
-        return text
-    else:
-        from pygments.lexers import CLexer
-        from pygments.formatters import TerminalFormatter
-
-        return highlight(text, CLexer(), TerminalFormatter())
-
-
 # vim: foldmethod=marker
-- 
GitLab