From 9ca753fed36d7af22715b758ef86c2ae92c1839a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 6 Feb 2014 17:59:44 -0600
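Subject: [PATCH] Don't store the target CL device in the kernel object

make_kernel() and LoopKernel no longer take a 'device' argument.
Device-specific work now happens where a device is passed explicitly:
preprocess_kernel() and generate_code() accept an optional 'device'
and, if none is given, warn and skip the local temporary storage
adjustment and the size checks, respectively. CompiledKernel passes
the first device of its context to both.

Kernels now carry a 'state' (kernel_state.INITIAL, PREPROCESSED or
SCHEDULED). generate_loop_schedules() no longer preprocesses
implicitly; it raises LoopyError unless the kernel has already been
preprocessed, and preprocess_kernel() refuses to run twice.

Also export get_one_scheduled_kernel from the loopy namespace and drop
the 'codegen_kwargs' argument of CompiledKernel.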
---
 doc/reference.rst         |   2 +
 doc/tutorial.rst          |  31 ++++----
 examples/hello-loopy.py   |   2 +-
 loopy/__init__.py         |   4 +-
 loopy/auto_test.py        |  15 ++--
 loopy/check.py            |  23 +++---
 loopy/codegen/__init__.py |   9 ++-
 loopy/compiled.py         |  10 +--
 loopy/kernel/__init__.py  |  58 ++++++++++-----
 loopy/kernel/creation.py  |   5 +-
 loopy/preprocess.py       |  30 +++++---
 loopy/schedule.py         |  12 ++--
 test/test_dg.py           |   4 +-
 test/test_linalg.py       |  28 ++++----
 test/test_loopy.py        | 147 ++++++++++++++++++--------------------
 test/test_nbody.py        |  20 +++---
 test/test_sem_reagan.py   |  20 +++---
 17 files changed, 236 insertions(+), 184 deletions(-)

diff --git a/doc/reference.rst b/doc/reference.rst
index 164a44b5e..fab13029c 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -426,6 +426,8 @@ Finishing up
 
 .. autofunction:: generate_loop_schedules
 
+.. autofunction:: get_one_scheduled_kernel
+
 .. autofunction:: generate_code
 
 Running
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 72c6c5b0c..7785a4259 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -52,7 +52,7 @@ one vector, doubles it, and writes it to another.
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     "out[i] = 2*a[i]")
 
@@ -80,9 +80,6 @@ The parts that you see here are the two main components of a loopy kernel:
   See :ref:`expression-syntax` for a full list of allowed constructs in the
   left- and right-hand side expression of an assignment.
 
-Loopy also needs to know which OpenCL device to target.  ``ctx.devices[0]``
-specifies the first device in our OpenCL context.
-
 As you create and transform kernels, it's useful to know that you can
 always see loopy's view of a kernel by printing it.
 
@@ -234,6 +231,8 @@ call :func:`loopy.generate_code`:
 .. doctest::
 
     >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
+    >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0])
+    >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl)
     >>> code, _ = lp.generate_code(typed_knl)
     >>> print code
     <BLANKLINE>
@@ -257,7 +256,7 @@ argument:
 .. doctest::
 
     >>> # WARNING: Incorrect.
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i,j]: 0<=i,j<n }",
     ...     """
     ...     out[j,i] = a[i,j]
@@ -284,7 +283,7 @@ an explicit dependency:
 .. doctest::
 
     >>> # WARNING: Incorrect.
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i,j]: 0<=i,j<n }",
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
@@ -384,7 +383,7 @@ with identical bounds, for the use of the transpose:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i,j,ii,jj]: 0<=i,j,ii,jj<n }",
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
@@ -429,7 +428,7 @@ zero-fill kernel?
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i,j]: 0<=i,j<n }",
     ...     """
     ...     a[i,j] = 0
@@ -514,7 +513,7 @@ Consider this example:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     "a[i] = 0", assumptions="n>=0")
     >>> knl = lp.split_iname(knl, "i", 16)
@@ -564,7 +563,7 @@ commonly called 'loop tiling':
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i,j]: 0<=i,j<n }",
     ...     "out[i,j] = a[j,i]",
     ...     assumptions="n mod 16 = 0 and n >= 1")
@@ -604,7 +603,7 @@ loop's tag to ``"unr"``:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     "a[i] = 0", assumptions="n>=0 and n mod 4 = 0")
     >>> orig_knl = knl
@@ -679,7 +678,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     "a[i] = 0", assumptions="n>=0")
     >>> knl = lp.split_iname(knl, "i", 128,
@@ -724,7 +723,7 @@ assumption:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     "a[i] = 0", assumptions="n>=0")
     >>> orig_knl = knl
@@ -821,7 +820,7 @@ Attempting to create this kernel results in an error:
 
 .. doctest::
 
-    >>> lp.make_kernel(ctx.devices[0],
+    >>> lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
     ...     """
     ...     out[i] = 5
@@ -848,7 +847,7 @@ be told in order for the error to disappear--note the *assumptions* argument:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...      "{ [i]: 0<=i<n }",
     ...      """
     ...      out[i] = 5
@@ -868,7 +867,7 @@ This kernel performs a simple transposition of an input matrix:
 
 .. doctest::
 
-    >>> knl = lp.make_kernel(ctx.devices[0],
+    >>> knl = lp.make_kernel(
     ...       "{ [i,j]: 0<=i,j<n }",
     ...       """
     ...       out[j,i] = a[i,j]
diff --git a/examples/hello-loopy.py b/examples/hello-loopy.py
index c7c7ade30..efdbf1315 100644
--- a/examples/hello-loopy.py
+++ b/examples/hello-loopy.py
@@ -13,7 +13,7 @@ a = cl.array.arange(queue, n, dtype=np.float32)
 
 # create
 # ------
-knl = lp.make_kernel(ctx.devices[0],
+knl = lp.make_kernel(
         "{ [i]: 0<=i<n }",
         "out[i] = 2*a[i]")
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 51925f9b6..8934aebc6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -60,7 +60,7 @@ from loopy.padding import (split_arg_axis, find_padding_multiple,
         add_padding)
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
-from loopy.schedule import generate_loop_schedules
+from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.codegen import generate_code
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -94,7 +94,7 @@ __all__ = [
         "infer_argument_dtypes", "add_and_infer_dtypes",
 
         "preprocess_kernel", "realize_reduction", "infer_unknown_types",
-        "generate_loop_schedules",
+        "generate_loop_schedules", "get_one_scheduled_kernel",
         "generate_code",
 
         "CompiledKernel",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 62797d00a..46c21ccad 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -28,6 +28,7 @@ import numpy as np
 
 import pyopencl as cl
 import pyopencl.array as cl_array
+import loopy as lp
 
 
 AUTO_TEST_SKIP_RUN = False
@@ -36,7 +37,6 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-
 # {{{ create random argument arrays for testing
 
 def fill_rand(ary):
@@ -398,8 +398,9 @@ def auto_test_vs_ref(
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-        import loopy as lp
-        for knl in lp.generate_loop_schedules(ref_knl):
+        pp_ref_knl = lp.preprocess_kernel(ref_knl, device=dev)
+
+        for knl in lp.generate_loop_schedules(pp_ref_knl):
             ref_sched_kernel = knl
             break
 
@@ -487,6 +488,12 @@ def auto_test_vs_ref(
 
         test_kernels = test_knl
     else:
+        from loopy.kernel import kernel_state
+        if test_knl.state not in [
+                kernel_state.PREPROCESSED,
+                kernel_state.SCHEDULED]:
+            test_knl = lp.preprocess_kernel(test_knl, device=ctx.devices[0])
+
         if not test_knl.schedule:
             test_kernels = lp.generate_loop_schedules(test_knl)
         else:
@@ -604,7 +611,7 @@ def auto_test_vs_ref(
 
         print("elapsed: %g s event, %s s marker-event %g s wall "
                 "(%d rounds)%s" % (
-                elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates))
+                    elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates))
 
         if do_check:
             ref_rates = ""
diff --git a/loopy/check.py b/loopy/check.py
index 70d16c2d3..df8a61b18 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -373,7 +373,7 @@ def pre_schedule_checks(kernel):
 
 # {{{ pre-code-generation checks
 
-def check_sizes(kernel):
+def check_sizes(kernel, device):
     import loopy as lp
 
     from loopy.diagnostic import LoopyAdvisory
@@ -386,7 +386,7 @@ def check_sizes(kernel):
     glens, llens = kernel.get_grid_sizes_as_exprs()
 
     if (max(len(glens), len(llens))
-            > kernel.device.max_work_item_dimensions):
+            > device.max_work_item_dimensions):
         raise LoopyError("too many work item dimensions")
 
     from pymbolic import evaluate
@@ -401,15 +401,15 @@ def check_sizes(kernel):
                 % name, LoopyAdvisory)
     else:
         for i in range(len(llens)):
-            if llens[i] > kernel.device.max_work_item_sizes[i]:
+            if llens[i] > device.max_work_item_sizes[i]:
                 raise LoopyError("group axis %d too big" % i)
 
         from pytools import product
-        if product(llens) > kernel.device.max_work_group_size:
+        if product(llens) > device.max_work_group_size:
             raise LoopyError("work group too big")
 
     from pyopencl.characterize import usable_local_mem_size
-    if kernel.local_mem_use() > usable_local_mem_size(kernel.device):
+    if kernel.local_mem_use() > usable_local_mem_size(device):
         raise LoopyError("using too much local memory")
 
     from loopy.kernel.data import ConstantArg
@@ -417,7 +417,7 @@ def check_sizes(kernel):
             1 for arg in kernel.args
             if isinstance(arg, ConstantArg))
 
-    if const_arg_count > kernel.device.max_constant_args:
+    if const_arg_count > device.max_constant_args:
         raise LoopyError("too many constant arguments")
 
 
@@ -457,11 +457,18 @@ def check_that_shapes_and_strides_are_arguments(kernel):
                                     arg.name, ", ".join(deps-integer_arg_names)))
 
 
-def pre_codegen_checks(kernel):
+def pre_codegen_checks(kernel, device=None):
     try:
         logger.info("pre-codegen check %s: start" % kernel.name)
 
-        check_sizes(kernel)
+        if device is not None:
+            check_sizes(kernel, device)
+        else:
+            from loopy.diagnostic import warn
+            warn(kernel, "no_device_in_pre_codegen_checks",
+                    "No device parameter was passed to loopy.pre_codegen_checks. "
+                    "Perhaps you want to pass a device argument to generate_code.")
+
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.info("pre-codegen check %s: done" % kernel.name)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index afd8ed762..c33f336cb 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -23,6 +23,7 @@ THE SOFTWARE.
 """
 
 
+from loopy.diagnostic import LoopyError
 from pytools import Record
 import islpy as isl
 
@@ -287,16 +288,20 @@ class ImplementedDataInfo(Record):
 
 # {{{ main code generation entrypoint
 
-def generate_code(kernel):
+def generate_code(kernel, device=None):
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
         kernel = get_one_scheduled_kernel(kernel)
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.SCHEDULED:
+        raise LoopyError("cannot generate code for a kernel that has not been "
+                "scheduled")
 
     from loopy.preprocess import infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
 
     from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel)
+    pre_codegen_checks(kernel, device=device)
 
     from cgen import (FunctionBody, FunctionDeclaration,
             Value, Module, Block,
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 47bcac8b8..bfed0c5c8 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -632,7 +632,7 @@ class _CLKernelInfo(Record):
 
 
 class CompiledKernel:
-    def __init__(self, context, kernel, codegen_kwargs={}):
+    def __init__(self, context, kernel):
         """
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
@@ -641,7 +641,6 @@ class CompiledKernel:
         """
 
         self.context = context
-        self.codegen_kwargs = codegen_kwargs
         self.kernel = kernel
 
         self.packing_controller = SeparateArrayPackingController(kernel)
@@ -676,6 +675,9 @@ class CompiledKernel:
             kernel = infer_unknown_types(kernel, expect_completion=True)
 
         if kernel.schedule is None:
+            from loopy.preprocess import preprocess_kernel
+            kernel = preprocess_kernel(kernel, self.context.devices[0])
+
             from loopy.schedule import get_one_scheduled_kernel
             kernel = get_one_scheduled_kernel(kernel)
 
@@ -686,7 +688,7 @@ class CompiledKernel:
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code
-        code, impl_arg_info = generate_code(kernel, **self.codegen_kwargs)
+        code, impl_arg_info = generate_code(kernel, device=self.context.devices[0])
 
         if self.kernel.options.write_cl:
             output = code
@@ -724,7 +726,7 @@ class CompiledKernel:
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
 
         from loopy.codegen import generate_code
-        code, arg_info = generate_code(kernel, **self.codegen_kwargs)
+        code, arg_info = generate_code(kernel, device=self.context.devices[0])
         return code
 
     def get_highlighted_code(self, arg_to_dtype=None):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index c6f3abc14..81a64484c 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -26,12 +26,12 @@ THE SOFTWARE.
 
 
 import numpy as np
-from pytools import Record, memoize_method
+from pytools import RecordWithoutPickling, memoize_method
 import islpy as isl
 from islpy import dim_type
 import re
 
-from pytools import UniqueNameGenerator, generate_unique_possibilities
+from pytools import UniqueNameGenerator, generate_unique_names
 
 from loopy.library.function import (
         default_function_mangler,
@@ -79,13 +79,15 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):
 
 # {{{ loop kernel object
 
-class LoopKernel(Record):
-    """These correspond more or less directly to arguments of
-    :func:`loopy.make_kernel`.
+class kernel_state:
+    INITIAL = 0
+    PREPROCESSED = 1
+    SCHEDULED = 2
 
-    .. attribute:: device
 
-        :class:`pyopencl.Device`
+class LoopKernel(RecordWithoutPickling):
+    """These correspond more or less directly to arguments of
+    :func:`loopy.make_kernel`.
 
     .. attribute:: domains
 
@@ -137,11 +139,15 @@ class LoopKernel(Record):
     .. attribute:: options
 
         An instance of :class:`loopy.Options`
+
+    .. attribute:: state
+
+        A value from :class:`kernel_state`.
     """
 
     # {{{ constructor
 
-    def __init__(self, device, domains, instructions, args=[], schedule=None,
+    def __init__(self, domains, instructions, args=[], schedule=None,
             name="loopy_kernel",
             preambles=[],
             preamble_generators=[default_preamble_generator],
@@ -167,6 +173,8 @@ class LoopKernel(Record):
             isl_context=None,
             options=None,
 
+            state=kernel_state.INITIAL,
+
             # When kernels get intersected in slab decomposition,
             # their grid sizes shouldn't change. This provides
             # a way to forward sub-kernel grid size requests.
@@ -247,8 +255,15 @@ class LoopKernel(Record):
             # overwrites method down below
             self.get_grid_sizes = get_grid_sizes
 
-        Record.__init__(self,
-                device=device, domains=domains,
+        if state not in [
+                kernel_state.INITIAL,
+                kernel_state.PREPROCESSED,
+                kernel_state.SCHEDULED,
+                ]:
+            raise ValueError("invalid value for 'state'")
+
+        RecordWithoutPickling.__init__(self,
+                domains=domains,
                 instructions=instructions,
                 args=args,
                 schedule=schedule,
@@ -269,7 +284,8 @@ class LoopKernel(Record):
                 symbol_manglers=symbol_manglers,
                 index_dtype=index_dtype,
                 isl_context=isl_context,
-                options=options)
+                options=options,
+                state=state)
 
     # }}}
 
@@ -310,7 +326,7 @@ class LoopKernel(Record):
 
         used_ids = set(insn.id for insn in insns) | extra_used_ids
 
-        for id_str in generate_unique_possibilities(based_on):
+        for id_str in generate_unique_names(based_on):
             if id_str not in used_ids:
                 return id_str
 
@@ -745,7 +761,7 @@ class LoopKernel(Record):
                     dom_intersect_assumptions, iname_idx)
                 .coalesce())
 
-        class BoundsRecord(Record):
+        class BoundsRecord(RecordWithoutPickling):
             pass
 
         size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
@@ -814,8 +830,6 @@ class LoopKernel(Record):
 
             tgt_dict[tag.axis] = size
 
-        max_dims = self.device.max_work_item_dimensions
-
         def to_dim_tuple(size_dict, which, forced_sizes={}):
             forced_sizes = forced_sizes.copy()
 
@@ -840,10 +854,6 @@ class LoopKernel(Record):
 
                 size_list.append(size_dict[cur_axis])
 
-            if len(size_list) > max_dims:
-                raise ValueError("more %s dimensions assigned than supported "
-                        "by hardware (%d > %d)" % (which, len(size_list), max_dims))
-
             return tuple(size_list)
 
         return (to_dim_tuple(global_sizes, "global"),
@@ -1016,6 +1026,16 @@ class LoopKernel(Record):
 
     # }}}
 
+    def __getinitargs__(self):
+        result = dict(
+                (key, getattr(self, key))
+                for key in self.__class__.fields
+                if hasattr(self, key))
+
+        result.pop("cache_manager", None)
+
+        return result
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 1d5c3d668..e3f7ab14e 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -935,10 +935,9 @@ def resolve_wildcard_deps(knl):
 
 # {{{ kernel creation top-level
 
-def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs):
+def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     """User-facing kernel creation entrypoint.
 
-    :arg device: :class:`pyopencl.Device`
     :arg domains: :class:`islpy.BasicSet`
     :arg instructions:
     :arg kernel_data:
@@ -1101,7 +1100,7 @@ def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs):
     kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args)
 
     from loopy.kernel import LoopKernel
-    knl = LoopKernel(device, domains, instructions, kernel_args,
+    knl = LoopKernel(domains, instructions, kernel_args,
             temporary_variables=temporary_variables,
             silenced_warnings=silenced_warnings,
             options=options,
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ec1e750d3..60643f559 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -985,12 +985,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
 # {{{ temp storage adjust for bank conflict
 
-def adjust_local_temp_var_storage(kernel):
+def adjust_local_temp_var_storage(kernel, device):
     logger.debug("%s: adjust temp var storage" % kernel.name)
 
     new_temp_vars = {}
 
-    lmem_size = cl_char.usable_local_mem_size(kernel.device)
+    lmem_size = cl_char.usable_local_mem_size(device)
     for temp_var in kernel.temporary_variables.itervalues():
         if not temp_var.is_local:
             new_temp_vars[temp_var.name] = \
@@ -1013,12 +1013,12 @@ def adjust_local_temp_var_storage(kernel):
         # below to avoid bank conflicts
         from pytools import product
 
-        if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL:
+        if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
             # FIXME: could try to avoid cache associativity disasters
             new_storage_shape = storage_shape
 
-        elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL:
-            min_mult = cl_char.local_memory_bank_count(kernel.device)
+        elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
+            min_mult = cl_char.local_memory_bank_count(device)
             good_incr = None
             new_storage_shape = storage_shape
             min_why_not = None
@@ -1028,7 +1028,7 @@ def adjust_local_temp_var_storage(kernel):
                 test_storage_shape = storage_shape[:]
                 test_storage_shape[-1] = test_storage_shape[-1] + increment
                 new_mult, why_not = cl_char.why_not_local_access_conflict_free(
-                        kernel.device, temp_var.dtype.itemsize,
+                        device, temp_var.dtype.itemsize,
                         temp_var.shape, test_storage_shape)
 
                 # will choose smallest increment 'automatically'
@@ -1062,7 +1062,12 @@ def adjust_local_temp_var_storage(kernel):
 # }}}
 
 
-def preprocess_kernel(kernel):
+def preprocess_kernel(kernel, device=None):
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.INITIAL:
+        raise LoopyError("cannot re-preprocess an already preprocessed "
+                "kernel")
+
     logger.info("%s: preprocess start" % kernel.name)
 
     from loopy.subst import expand_subst
@@ -1096,11 +1101,18 @@ def preprocess_kernel(kernel):
     kernel = assign_automatic_axes(kernel)
     kernel = find_boostability(kernel)
     kernel = limit_boostability(kernel)
-    kernel = adjust_local_temp_var_storage(kernel)
+
+    if device is not None:
+        kernel = adjust_local_temp_var_storage(kernel, device)
+    else:
+        from loopy.diagnostic import warn
+        warn(kernel, "no_device_in_preprocess",
+                "no device parameter was passed to loopy.preprocess")
 
     logger.info("%s: preprocess done" % kernel.name)
 
-    return kernel
+    return kernel.copy(
+            state=kernel_state.PREPROCESSED)
 
 
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 697322f11..419672a96 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -999,10 +999,12 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
 # {{{ main scheduling entrypoint
 
 def generate_loop_schedules(kernel, debug_args={}):
-    loop_priority = kernel.loop_priority
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.PREPROCESSED:
+        raise LoopyError("cannot schedule a kernel that has not been "
+                "preprocessed")
 
-    from loopy.preprocess import preprocess_kernel
-    kernel = preprocess_kernel(kernel)
+    loop_priority = kernel.loop_priority
 
     from loopy.check import pre_schedule_checks
     pre_schedule_checks(kernel)
@@ -1049,7 +1051,9 @@ def generate_loop_schedules(kernel, debug_args={}):
                     reverse=False, kind="local")
 
             debug.stop()
-            yield kernel.copy(schedule=gen_sched)
+            yield kernel.copy(
+                    schedule=gen_sched,
+                    state=kernel_state.SCHEDULED)
             debug.start()
 
             schedule_count += 1
diff --git a/test/test_dg.py b/test/test_dg.py
index 956bee2d7..291da4484 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -47,7 +47,7 @@ def test_dg_volume(ctx_factory):
 
     K = 10000
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel([
             "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
             ],
             """
@@ -175,7 +175,7 @@ def no_test_dg_surface(ctx_factory):
 
     K = 10000
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[m,n,k]: 0<= m < NfpNfaces and 0<= n < Np and 0<= k < K }"
                 ],
diff --git a/test/test_linalg.py b/test/test_linalg.py
index ef0d0c0ca..47c7600d6 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -73,7 +73,7 @@ def test_axpy(ctx_factory):
                 vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)),
             (np.float32, None, 5, 7),
             ]:
-        knl = lp.make_kernel(ctx.devices[0],
+        knl = lp.make_kernel(
                 "[n] -> {[i]: 0<=i<n}",
                 [
                     "z[i] = a*x[i]+b*y[i]"
@@ -121,7 +121,7 @@ def test_transpose(ctx_factory):
 
     n = get_suitable_size(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<%d}" % n,
             [
                 "b[i, j] = a[j, i]"
@@ -155,7 +155,7 @@ def test_plain_matrix_mul(ctx_factory):
             (cl_array.vec.float4, check_float4, 4),
             (np.float32, None, 1),
             ]:
-        knl = lp.make_kernel(ctx.devices[0],
+        knl = lp.make_kernel(
                 "{[i,j,k]: 0<=i,j,k<%d}" % n,
                 [
                     "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -189,7 +189,7 @@ def test_variable_size_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[n] -> {[i,j,k]: 0<=i,j,k<n}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j]) {id=labl}"
@@ -226,7 +226,7 @@ def test_rank_one(ctx_factory):
     #n = int(get_suitable_size(ctx)**(2.7/2))
     n = 16**3
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[n] -> {[i,j]: 0<=i,j<n}",
             [
                 "c[i, j] = a[i]*b[j] {id=mylabel, priority =5}"
@@ -303,7 +303,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
 
     n = 6*16*2
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -340,7 +340,7 @@ def test_intel_matrix_mul(ctx_factory):
 
     n = 128+32
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -392,7 +392,7 @@ def test_magma_fermi_matrix_mul(ctx_factory):
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
         pytest.skip("image format not supported")
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -439,7 +439,7 @@ def test_image_matrix_mul(ctx_factory):
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
         pytest.skip("image format not supported")
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -477,7 +477,7 @@ def test_image_matrix_mul_ilp(ctx_factory):
 
     n = get_suitable_size(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -510,12 +510,11 @@ def test_image_matrix_mul_ilp(ctx_factory):
 @pytest.mark.skipif("sys.version_info < (2,6)")
 def test_ilp_race_matmul(ctx_factory):
     dtype = np.float32
-    ctx = ctx_factory()
     order = "C"
 
     n = 9
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -534,6 +533,7 @@ def test_ilp_race_matmul(ctx_factory):
     from loopy.diagnostic import WriteRaceConditionWarning
     from warnings import catch_warnings
     with catch_warnings(record=True) as warn_list:
+        knl = lp.preprocess_kernel(knl)
         list(lp.generate_loop_schedules(knl))
 
         assert any(isinstance(w.message, WriteRaceConditionWarning)
@@ -548,7 +548,7 @@ def test_fancy_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[n] -> {[i,j,k]: 0<=i,j,k<n }",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
@@ -582,7 +582,7 @@ def test_small_batched_matvec(ctx_factory):
     K = 9997
     Np = 36
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
             [
                 "result[k, i] = sum(j, d[i, j]*f[k, j])"
diff --git a/test/test_loopy.py b/test/test_loopy.py
index b19df76b4..07a737900 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -52,7 +52,7 @@ __all__ = [
 def test_complicated_subst(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
                 f(x) := x*a[x]
@@ -84,7 +84,7 @@ def test_complicated_subst(ctx_factory):
 def test_type_inference_no_artificial_doubles(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
                 <> bb = a[i] - b[i]
@@ -98,6 +98,7 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
                 ],
             assumptions="n>=1")
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     for k in lp.generate_loop_schedules(knl):
         code = lp.generate_code(k)
         assert "double" not in code
@@ -106,7 +107,7 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
 def test_sized_and_complex_literals(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
                 <> aa = 5jf
@@ -129,7 +130,7 @@ def test_sized_and_complex_literals(ctx_factory):
 def test_simple_side_effect(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<100}",
             """
                 a[i] = a[i] + 1
@@ -137,6 +138,7 @@ def test_simple_side_effect(ctx_factory):
             [lp.GlobalArg("a", np.float32, shape=(100,))]
             )
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     kernel_gen = lp.generate_loop_schedules(knl)
 
     for gen_knl in kernel_gen:
@@ -148,7 +150,7 @@ def test_simple_side_effect(ctx_factory):
 def test_nonsense_reduction(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
             """
                 a[i] = sum(i, 2)
@@ -158,13 +160,13 @@ def test_nonsense_reduction(ctx_factory):
 
     import pytest
     with pytest.raises(RuntimeError):
-        list(lp.generate_loop_schedules(knl))
+        knl = lp.preprocess_kernel(knl, ctx.devices[0])
 
 
 def test_owed_barriers(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
             [
                 "<float32> z[i] = a[i]"
@@ -174,6 +176,7 @@ def test_owed_barriers(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     kernel_gen = lp.generate_loop_schedules(knl)
 
     for gen_knl in kernel_gen:
@@ -184,7 +187,7 @@ def test_owed_barriers(ctx_factory):
 def test_wg_too_small(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
             [
                 "<float32> z[i] = a[i] {id=copy}"
@@ -194,6 +197,7 @@ def test_wg_too_small(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     kernel_gen = lp.generate_loop_schedules(knl)
 
     import pytest
@@ -205,7 +209,7 @@ def test_wg_too_small(ctx_factory):
 def test_join_inames(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<16}",
             [
                 "b[i,j] = 2*a[i,j]"
@@ -227,7 +231,7 @@ def test_join_inames(ctx_factory):
 def test_divisibility_assumption(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[n] -> {[i]: 0<=i<n}",
             [
                 "b[i] = 2*a[i]"
@@ -243,6 +247,7 @@ def test_divisibility_assumption(ctx_factory):
 
     knl = lp.split_iname(knl, "i", 16)
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     for k in lp.generate_loop_schedules(knl):
         code = lp.generate_code(k)
         assert "if" not in code
@@ -254,7 +259,7 @@ def test_divisibility_assumption(ctx_factory):
 def test_multi_cse(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
             [
                 "<float32> z[i] = a[i] + a[i]**2"
@@ -265,6 +270,7 @@ def test_multi_cse(ctx_factory):
     knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     kernel_gen = lp.generate_loop_schedules(knl)
 
     for gen_knl in kernel_gen:
@@ -279,7 +285,7 @@ def test_stencil(ctx_factory):
     # non-unifiable, two-constant-segments PwAff as the base index)
 
     n = 256
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<= i,j < %d}" % n,
             [
                 "a_offset(ii, jj) := a[ii+1, jj+1]",
@@ -320,7 +326,7 @@ def test_stencil(ctx_factory):
 def test_stencil_with_overfetch(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<= i,j < n}",
             [
                 "a_offset(ii, jj) := a[ii+2, jj+2]",
@@ -361,7 +367,7 @@ def test_stencil_with_overfetch(ctx_factory):
 def test_eq_constraint(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<= i,j < 32}",
             [
                 "a[i] = b[i]"
@@ -374,6 +380,7 @@ def test_eq_constraint(ctx_factory):
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
     knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     kernel_gen = lp.generate_loop_schedules(knl)
 
     for knl in kernel_gen:
@@ -388,7 +395,7 @@ def test_argmax(ctx_factory):
 
     n = 10000
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i]: 0<=i<%d}" % n,
             [
                 "<> result = argmax(i, fabs(a[i]))",
@@ -489,7 +496,7 @@ def test_fuzz_code_generator(ctx_factory):
             else:
                 return np.float64
 
-        knl = lp.make_kernel(ctx.devices[0], "{ : }",
+        knl = lp.make_kernel("{ : }",
                 [lp.ExpressionInstruction("value", expr)],
                 [lp.GlobalArg("value", np.complex128, shape=())]
                 + [
@@ -523,7 +530,7 @@ def test_empty_reduction(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<20}",
                 "[i] -> {[j]: 0<=j<0}"
@@ -546,7 +553,7 @@ def test_nested_dependent_reduction(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<n}",
                 "{[j]: 0<=j<i+sumlen}"
@@ -575,7 +582,7 @@ def test_multi_nested_dependent_reduction(ctx_factory):
     dtype = np.dtype(np.int32)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[itgt]: 0 <= itgt < ntgts}",
                 "{[isrc_box]: 0 <= isrc_box < nboxes}",
@@ -603,7 +610,7 @@ def test_recursive_nested_dependent_reduction(ctx_factory):
     dtype = np.dtype(np.int32)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[itgt]: 0 <= itgt < ntgts}",
                 "{[isrc_box]: 0 <= isrc_box < nboxes}",
@@ -632,7 +639,7 @@ def test_dependent_loop_bounds(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<n}",
                 "{[jj]: 0<=jj<row_len}",
@@ -660,7 +667,7 @@ def test_dependent_loop_bounds_2(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<n}",
                 "{[jj]: 0<=jj<row_len}",
@@ -696,7 +703,7 @@ def test_dependent_loop_bounds_3(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<n}",
                 "{[jj]: 0<=jj<row_len}",
@@ -724,6 +731,8 @@ def test_dependent_loop_bounds_3(ctx_factory):
     knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
             inner_tag="l.1")
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+
     import pytest
     with pytest.raises(RuntimeError):
         list(lp.generate_loop_schedules(knl_bad))
@@ -734,7 +743,7 @@ def test_independent_multi_domain(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "{[i]: 0<=i<n}",
                 "{[j]: 0<=j<n}",
@@ -770,7 +779,7 @@ def test_bare_data_dependency(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             [
                 "[znirp] -> {[i]: 0<=i<znirp}",
                 ],
@@ -799,7 +808,7 @@ def test_equality_constraints(ctx_factory):
 
     n = 10
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel([
             "[n] -> {[i,j]: 0<=i,j<n }",
             "{[k]: k =i+5 and k < n}",
             ],
@@ -833,7 +842,7 @@ def test_stride(ctx_factory):
 
     n = 10
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel([
             "{[i]: 0<=i<n and (exists l: i = 2*l)}",
             ],
             [
@@ -859,7 +868,7 @@ def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
 
     n = 10
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel([
             "{[i]: 0<=i<n }",
             "{[k]: k=i and (exists l: k = 2*l) }",
             ],
@@ -886,9 +895,8 @@ def test_double_sum(ctx_factory):
 
     n = 20
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             [
                 "a = sum((i,j), i*j)",
                 "b = sum(i, sum(j, i*j))",
@@ -910,9 +918,8 @@ def test_double_sum(ctx_factory):
 def test_ilp_write_race_detection_global(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "[n] -> {[i,j]: 0<=i,j<n }",
-            ],
             [
                 "a[i] = 5+i+j",
                 ],
@@ -924,6 +931,8 @@ def test_ilp_write_race_detection_global(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+
     from loopy.diagnostic import WriteRaceConditionWarning
     from warnings import catch_warnings
     with catch_warnings(record=True) as warn_list:
@@ -936,7 +945,7 @@ def test_ilp_write_race_detection_global(ctx_factory):
 def test_ilp_write_race_avoidance_local(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i<16 and 0<=j<17 }",
             [
                 "<> a[i] = 5+i+j",
@@ -945,6 +954,7 @@ def test_ilp_write_race_avoidance_local(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     for k in lp.generate_loop_schedules(knl):
         assert k.temporary_variables["a"].shape == (16, 17)
 
@@ -952,7 +962,7 @@ def test_ilp_write_race_avoidance_local(ctx_factory):
 def test_ilp_write_race_avoidance_private(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[j]: 0<=j<16 }",
             [
                 "<> a = 5+j",
@@ -961,6 +971,7 @@ def test_ilp_write_race_avoidance_private(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
     for k in lp.generate_loop_schedules(knl):
         assert k.temporary_variables["a"].shape == (16,)
 
@@ -971,9 +982,8 @@ def test_write_parameter(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             """
                 a = sum((i,j), i*j)
                 b = sum(i, sum(j, i*j))
@@ -996,9 +1006,8 @@ def test_write_parameter(ctx_factory):
 def test_arg_shape_guessing(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             """
                 a = 1.5 + sum((i,j), i*j)
                 b[i, j] = i*j
@@ -1019,9 +1028,8 @@ def test_arg_shape_guessing(ctx_factory):
 def test_arg_guessing(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             """
                 a = 1.5 + sum((i,j), i*j)
                 b[i, j] = i*j
@@ -1037,9 +1045,8 @@ def test_arg_guessing_with_reduction(ctx_factory):
     #logging.basicConfig(level=logging.DEBUG)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             """
                 a = 1.5 + sum((i,j), i*j)
                 d = 1.5 + sum((i,j), b[i,j])
@@ -1057,9 +1064,8 @@ def test_arg_guessing_with_reduction(ctx_factory):
 def test_nonlinear_index(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             """
                 a[i*i] = 17
                 """,
@@ -1076,9 +1082,8 @@ def test_nonlinear_index(ctx_factory):
 def test_triangle_domain(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n and i <= j}",
-            ],
             "a[i,j] = 17",
             assumptions="n>=1")
 
@@ -1092,9 +1097,8 @@ def test_offsets_and_slicing(ctx_factory):
 
     n = 20
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i<n and 0<=j<m }",
-            ],
             """
                 b[i,j] = 2*a[i,j]
                 """,
@@ -1128,7 +1132,7 @@ def test_offsets_and_slicing(ctx_factory):
 def test_vector_ilp_with_prefetch(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{ [i]: 0<=i<n }",
             "out[i] = 2*a[i]",
             [
@@ -1157,7 +1161,7 @@ def test_convolution(ctx_factory):
 
     dtype = np.float32
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
         "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                 -f_w <= f_x,f_y <= f_w \
                 and 0 <= im_x < im_w and 0 <= im_y < im_h \
@@ -1224,7 +1228,7 @@ def test_convolution_with_nonzero_base(ctx_factory):
 
     dtype = np.float32
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
         "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                 -f_w <= f_x,f_y <= f_w \
                 and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
@@ -1276,9 +1280,8 @@ def test_c_instruction(ctx_factory):
     #logging.basicConfig(level=logging.DEBUG)
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
-            ],
             [
                 lp.CInstruction("i", """
                     x = sin((float) i);
@@ -1301,7 +1304,7 @@ def test_c_instruction(ctx_factory):
 def test_dependent_domain_insn_iname_finding(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel([
             "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
             "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
             ],
@@ -1332,11 +1335,8 @@ def test_dependent_domain_insn_iname_finding(ctx_factory):
 
 
 def test_inames_deps_from_write_subscript(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n}",
-            ],
             """
                 <> src_ibox = source_boxes[i]
                 <int32> something = 5
@@ -1352,11 +1352,8 @@ def test_inames_deps_from_write_subscript(ctx_factory):
 
 
 def test_split_reduction(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j,k]: 0<=i,j,k<n}",
-            ],
             """
                 b = sum((i,j,k), a[i,j,k])
                 """,
@@ -1372,9 +1369,8 @@ def test_split_reduction(ctx_factory):
 def test_modulo_indexing(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0], [
+    knl = lp.make_kernel(
             "{[i,j]: 0<=i<n and 0<=j<5}",
-            ],
             """
                 b[i] = sum(j, a[(i+j)%n])
                 """,
@@ -1396,7 +1392,7 @@ def test_rob_stroud_bernstein(ctx_factory):
 
     # NOTE: tmp would have to be zero-filled beforehand
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[el, i2, alpha1,alpha2]: \
                     0 <= el < nels and \
                     0 <= i2 < nqp1d and \
@@ -1448,7 +1444,7 @@ def test_rob_stroud_bernstein_full(ctx_factory):
 
     # NOTE: result would have to be zero-filled beforehand
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                     0 <= el < nels and \
                     0 <= i2 < nqp1d and \
@@ -1517,7 +1513,7 @@ def test_rob_stroud_bernstein_full(ctx_factory):
 def test_vector_types(ctx_factory, vec_len):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
             "out[i,j] = 2*a[i,j]",
             [
@@ -1547,7 +1543,6 @@ def test_conditional(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
-            ctx.devices[0],
             "{ [i,j]: 0<=i,j<n }",
             """
                 <> my_a = a[i,j] {id=read_a}
@@ -1577,7 +1572,7 @@ def test_ilp_loop_bound(ctx_factory):
     # throughout. In ILP'd loops, not so much.
 
     ctx = ctx_factory()
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "{ [i,j,k]: 0<=i,j,k<n }",
             """
             out[i,k] = sum(j, a[i,j]*b[j,k])
@@ -1604,9 +1599,7 @@ def test_arg_shape_uses_assumptions(ctx_factory):
     # static shape for out, which is at least 1 x 1 in size, but otherwise of
     # size n x n.
 
-    ctx = ctx_factory()
-
-    lp.make_kernel(ctx.devices[0],
+    lp.make_kernel(
             "{ [i,j]: 0<=i,j<n }",
             """
             out[i,j] = 2*a[i,j]
@@ -1618,7 +1611,7 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
         "{ [i]: 0<=i<n }",
         "a[i] = 2*a[i]",
         assumptions="n>=1")
@@ -1651,7 +1644,7 @@ def test_multiple_writes_to_local_temporary(ctx_factory):
 
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
         "{[i,e]: 0<=i<5 and 0<=e<nelements}",
         """
         <> temp[i, 0] = 17
@@ -1659,8 +1652,10 @@ def test_multiple_writes_to_local_temporary(ctx_factory):
         """)
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    code, _ = lp.generate_code(knl)
-    print code
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    for k in lp.generate_loop_schedules(knl):
+        code, _ = lp.generate_code(k)
+        print code
 
 
 if __name__ == "__main__":
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 7ec973156..65e5658b5 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -40,17 +40,17 @@ def test_nbody(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[N] -> {[i,j,k]: 0<=i,j<N and 0<=k<3 }",
-           [
-               "axdist(k) := x[i,k]-x[j,k]",
-               "invdist := rsqrt(sum_float32(k, axdist(k)**2))",
-               "pot[i] = sum_float32(j, if(i != j, invdist, 0))",
-           ], [
-               lp.GlobalArg("x", dtype, shape="N,3", order="C"),
-               lp.GlobalArg("pot", dtype, shape="N", order="C"),
-               lp.ValueArg("N", np.int32),
-           ], name="nbody", assumptions="N>=1")
+            [
+                "axdist(k) := x[i,k]-x[j,k]",
+                "invdist := rsqrt(sum_float32(k, axdist(k)**2))",
+                "pot[i] = sum_float32(j, if(i != j, invdist, 0))",
+            ], [
+                lp.GlobalArg("x", dtype, shape="N,3", order="C"),
+                lp.GlobalArg("pot", dtype, shape="N", order="C"),
+                lp.ValueArg("N", np.int32),
+            ], name="nbody", assumptions="N>=1")
 
     seq_knl = knl
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index 2de1db43b..cfa23f35d 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -44,19 +44,19 @@ def test_tim2d(ctx_factory):
     field_shape = (K_sym, n, n)
 
     # K - run-time symbolic
-    knl = lp.make_kernel(ctx.devices[0],
+    knl = lp.make_kernel(
             "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
-           [
-            "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
-            "us(a,b) := sum(o, D[b,o]*u[e,a,o])",
+            [
+                "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
+                "us(a,b) := sum(o, D[b,o]*u[e,a,o])",
 
-            #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",
+                #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",
 
-            "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
-            "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
-            "lap[e,i,j]  = "
-            "  sum(m, D[m,i]*Gux(m,j))"
-            "+ sum(m, D[m,j]*Guy(i,m))"
+                "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
+                "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
+                "lap[e,i,j]  = "
+                "  sum(m, D[m,i]*Gux(m,j))"
+                "+ sum(m, D[m,j]*Guy(i,m))"
 
             ],
             [
-- 
GitLab