diff --git a/loopy/__init__.py b/loopy/__init__.py
index b4c7181d17b40df591886e087b4c89bb85d5e088..273e8573e3f7399004480845f10fe8de778bec01 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -68,7 +68,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules
 from loopy.codegen import generate_code
-from loopy.compiled import CompiledKernel, auto_test_vs_ref
+from loopy.compiled import CompiledKernel
+from loopy.auto_test import auto_test_vs_ref
 from loopy.check import check_kernels
 
 __all__ = [
@@ -89,6 +90,7 @@ __all__ = [
         "generate_loop_schedules",
         "generate_code",
         "CompiledKernel", "auto_test_vs_ref", "check_kernels",
+
         "make_kernel",
         "split_iname", "join_inames", "tag_inames", "duplicate_inames",
         "split_dimension", "join_dimensions", "tag_dimensions",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea172db2fb656c7f19f896cbfed089ffdb5f42a9
--- /dev/null
+++ b/loopy/auto_test.py
@@ -0,0 +1,582 @@
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from pytools import Record
+
+import numpy as np
+
+import pyopencl as cl
+import pyopencl.array as cl_array
+
+
+AUTO_TEST_SKIP_RUN = False  # if true, enqueue markers instead of running kernels
+
+
+# {{{ create random argument arrays for testing
+
+def fill_rand(ary):
+    """Fill *ary* with random data via :mod:`pyopencl.clrandom`; complex
+    arrays are filled through a view with their real component dtype."""
+    from pyopencl.clrandom import fill_rand as cl_fill_rand
+
+    target = ary
+    if ary.dtype.kind == "c":
+        target = ary.view(ary.dtype.type(0).real.dtype)
+    cl_fill_rand(target, luxury=0)
+
+
+class TestArgInfo(Record):
+    pass  # per-array test bookkeeping (ref_* set at creation, test_* added later)
+
+
+# {{{ "reference" arguments
+
+def make_ref_args(kernel, cl_arg_info, queue, parameters, fill_value):
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
+
+    from pymbolic import evaluate
+    # ref_args: kwargs for the reference kernel; ref_arg_data: per-argument info
+    ref_args = {}
+    ref_arg_data = []
+
+    for arg in cl_arg_info:
+        if arg.arg_class is ValueArg:
+            if arg.offset_for_name:
+                continue  # offset arguments get no entry in either result
+
+            arg_value = parameters[arg.name]
+            # coerce the parameter to the argument's dtype unless it already has it
+            try:
+                argv_dtype = arg_value.dtype
+            except AttributeError:
+                argv_dtype = None
+
+            if argv_dtype != arg.dtype:
+                arg_value = arg.dtype.type(arg_value)
+
+            ref_args[arg.name] = arg_value
+
+            ref_arg_data.append(None)  # value arguments carry no array info
+
+        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
+            if arg.shape is None:
+                raise ValueError("arrays need known shape to use automatic "
+                        "testing")
+
+            shape = evaluate(arg.shape, parameters)
+            # arrays written by the kernel are outputs, all others are inputs
+            is_output = arg.base_name in kernel.get_written_variables()
+
+            if arg.arg_class is ImageArg:
+                storage_array = ary = cl_array.empty(
+                        queue, shape, arg.dtype, order="C")
+                numpy_strides = None
+                alloc_size = None
+                strides = None
+            else:
+                strides = evaluate(arg.strides, parameters)
+                # strides count array entries; numpy_strides below are in bytes
+                from pytools import all
+                assert all(s > 0 for s in strides)
+                alloc_size = sum(astrd*(alen-1)
+                        for alen, astrd in zip(shape, strides)) + 1
+
+                dtype = arg.dtype
+                if dtype is None:
+                    raise RuntimeError("dtype for argument '%s' is not yet "
+                            "known. Perhaps you want to use "
+                            "loopy.add_argument_dtypes "
+                            "or loopy.infer_argument_dtypes?"
+                            % arg.name)
+
+                itemsize = dtype.itemsize
+                numpy_strides = [itemsize*s for s in strides]
+
+                storage_array = cl_array.empty(queue, alloc_size, dtype)
+                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+            # outputs are pre-filled with fill_value, inputs with random data
+            if is_output:
+                if arg.arg_class is ImageArg:
+                    raise RuntimeError("write-mode images not supported in "
+                            "automatic testing")
+
+                if dtype.isbuiltin:
+                    storage_array.fill(fill_value)
+                else:
+                    from warnings import warn
+                    warn("Cannot pre-fill array of dtype '%s'" % dtype)
+
+                ref_args[arg.name] = ary
+            else:
+                fill_rand(storage_array)
+                if arg.arg_class is ImageArg:
+                    # must be contiguous
+                    ref_args[arg.name] = cl.image_from_array(
+                            queue.context, ary.get())
+                else:
+                    ref_args[arg.name] = ary
+
+            ref_arg_data.append(
+                    TestArgInfo(
+                        name=arg.name,
+                        ref_array=ary,
+                        ref_storage_array=storage_array,
+                        ref_shape=shape,
+                        ref_strides=strides,
+                        ref_alloc_size=alloc_size,
+                        ref_numpy_strides=numpy_strides,
+                        needs_checking=is_output))
+        else:
+            raise RuntimeError("arg type not understood")
+
+    return ref_args, ref_arg_data
+
+# }}}
+
+
+# {{{ "full-scale" arguments
+
+def make_args(kernel, cl_arg_info, queue, ref_arg_data, parameters,
+        fill_value):
+    """Build arguments for the kernel under test, reusing the reference
+    input data and recording ``test_*`` attributes in *ref_arg_data*."""
+    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
+    from pymbolic import evaluate
+
+    args = {}
+    for arg, arg_desc in zip(cl_arg_info, ref_arg_data):
+        if arg.arg_class is ValueArg:
+            arg_value = parameters[arg.name]
+
+            try:
+                argv_dtype = arg_value.dtype
+            except AttributeError:
+                argv_dtype = None
+
+            if argv_dtype != arg.dtype:
+                arg_value = arg.dtype.type(arg_value)
+
+            args[arg.name] = arg_value
+
+        elif arg.arg_class is ImageArg:
+            if arg.base_name in kernel.get_written_variables():
+                raise NotImplementedError("write-mode images not supported in "
+                        "automatic testing")
+
+            shape = evaluate(arg.shape, parameters)
+            assert shape == arg_desc.ref_shape
+
+            # must be contiguous
+            args[arg.name] = cl.image_from_array(
+                    queue.context, arg_desc.ref_array.get())
+
+        elif arg.arg_class is GlobalArg:
+            shape = evaluate(arg.shape, parameters)
+            strides = evaluate(arg.strides, parameters)
+
+            itemsize = arg.dtype.itemsize
+            numpy_strides = [itemsize*s for s in strides]
+
+            assert all(s > 0 for s in strides)
+            alloc_size = sum(astrd*(alen-1)
+                    for alen, astrd in zip(shape, strides)) + 1
+
+            if arg.base_name in kernel.get_written_variables():
+                storage_array = cl_array.empty(queue, alloc_size, arg.dtype)
+                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+
+                if arg.dtype.isbuiltin:
+                    storage_array.fill(fill_value)
+                else:
+                    from warnings import warn
+                    warn("Cannot pre-fill array of dtype '%s'" % arg.dtype)
+
+                args[arg.name] = ary
+            else:
+                # use contiguous array to transfer to host
+                host_ref_contig_array = arg_desc.ref_storage_array.get()
+
+                # use device shape/strides
+                from pyopencl.compyte.array import as_strided
+                host_ref_array = as_strided(host_ref_contig_array,
+                        arg_desc.ref_shape, arg_desc.ref_numpy_strides)
+
+                # flatten the thing
+                host_ref_flat_array = host_ref_array.flatten()
+
+                # create host array with test shape (but not strides)
+                host_contig_array = np.empty(shape, dtype=arg.dtype)
+
+                common_len = min(
+                        len(host_ref_flat_array),
+                        len(host_contig_array.ravel()))
+                host_contig_array.ravel()[:common_len] = \
+                        host_ref_flat_array[:common_len]
+
+                # create host array with test shape and storage layout
+                host_storage_array = np.empty(alloc_size, arg.dtype)
+                host_array = as_strided(
+                        host_storage_array, shape, numpy_strides)
+                host_array[:] = host_contig_array
+
+                storage_array = cl_array.to_device(queue, host_storage_array)
+                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
+
+                args[arg.name] = ary
+
+            arg_desc.test_storage_array = storage_array
+            arg_desc.test_array = ary
+            arg_desc.test_shape = shape
+            arg_desc.test_strides = strides
+            arg_desc.test_numpy_strides = numpy_strides
+            arg_desc.test_alloc_size = alloc_size
+
+        else:
+            raise RuntimeError("arg type not understood")
+
+    return args
+
+# }}}
+
+# }}}
+
+
+# {{{ default array comparison
+
+def _default_check_result(result, ref_result):
+    """Return ``(True, None)`` if *result* matches *ref_result* to within
+    ``rtol=atol=1e-3``, else ``(False, message)`` with relative errors."""
+    if np.allclose(ref_result, result, rtol=1e-3, atol=1e-3):
+        return True, None
+
+    abs_diff = np.abs(ref_result-result)
+    abs_ref = np.abs(ref_result)
+    l2_err = np.sum(abs_diff**2) / np.sum(abs_ref**2)
+    # Normalize by the reference magnitude; dividing the difference by
+    # itself always yielded 1 (or nan) regardless of the actual error.
+    linf_err = np.max(abs_diff) / np.max(abs_ref)
+    return (False,
+            "results do not match(rel) l_2 err: %g, l_inf err: %g"
+            % (l2_err, linf_err))
+
+# }}}
+
+
+# {{{ ref device finder
+
+def _enumerate_cl_devices_for_ref_test():
+    """Yield candidate devices for the reference run, CPU devices first.
+
+    Skips the 'Portable OpenCL' platform (with a warning), raises
+    :exc:`RuntimeError` if no device is left, warns if none is a CPU.
+    """
+    from warnings import warn
+
+    cpu_devs = []
+    noncpu_devs = []
+
+    for pf in cl.get_platforms():
+        if pf.name == "Portable OpenCL":
+            # That implementation [1] isn't quite good enough yet.
+            # [1] https://launchpad.net/pocl
+            # FIXME remove when no longer true.
+            warn("Skipping 'Portable OpenCL' for lack of maturity.")
+            continue
+
+        for dev in pf.get_devices():
+            is_cpu = dev.type == cl.device_type.CPU
+            (cpu_devs if is_cpu else noncpu_devs).append(dev)
+
+    if not cpu_devs and not noncpu_devs:
+        raise RuntimeError("no CL device found for test")
+
+    if not cpu_devs:
+        warn("No CPU device found for reference test. The reference "
+                "computation will either fail because of a timeout "
+                "or take a *very* long time.")
+
+    for dev in cpu_devs + noncpu_devs:
+        yield dev
+
+# }}}
+
+
+# {{{ main automatic testing entrypoint
+
+def auto_test_vs_ref(
+        ref_knl, ctx, kernel_gen, op_count=[], op_label=[], parameters={},
+        print_ref_code=False, print_code=True, warmup_rounds=2,
+        code_op=None, dump_binary=False, codegen_kwargs={},
+        options=[],
+        fills_entire_output=True, do_check=True, check_result=None
+        ):
+    """Compare results of `ref_knl` to the kernels generated by the generator
+    `kernel_gen`.
+
+    :arg check_result: a callable with :class:`numpy.ndarray` arguments
+        *(result, reference_result)* returning a tuple (:class:`bool`,
+        message) indicating correctness/acceptability of the result
+    :arg do_check: if false, neither run the reference kernel nor compare
+    """
+    from loopy.compiled import CompiledKernel, get_highlighted_code
+
+    if isinstance(op_count, (int, float)):
+        from warnings import warn
+        warn("op_count should be a list", stacklevel=2)
+        op_count = [op_count]
+    if isinstance(op_label, str):
+        from warnings import warn
+        warn("op_label should be a list", stacklevel=2)
+        op_label = [op_label]
+
+    read_and_written_args = (
+            ref_knl.get_read_variables()
+            & ref_knl.get_written_variables()
+            & set(ref_knl.arg_dict))
+
+    if read_and_written_args:
+        # FIXME: In principle, that's possible to test
+        raise RuntimeError("kernel reads *and* writes argument(s) '%s' "
+                "and therefore cannot be automatically tested"
+                % ", ".join(read_and_written_args))
+
+    from time import time
+
+    if check_result is None:
+        check_result = _default_check_result
+
+    if fills_entire_output:  # differing fills: unwritten entries compare unequal
+        fill_value_ref = -17
+        fill_value = -18
+    else:
+        fill_value_ref = -17
+        fill_value = fill_value_ref
+
+    # {{{ compile and run reference code
+
+    found_ref_device = False
+
+    ref_errors = []
+
+    for dev in _enumerate_cl_devices_for_ref_test():
+        ref_ctx = cl.Context([dev])
+        ref_queue = cl.CommandQueue(ref_ctx,
+                properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+        import loopy as lp
+        ref_kernel_gen = lp.generate_loop_schedules(ref_knl)
+        for knl in lp.check_kernels(ref_kernel_gen, parameters):
+            ref_sched_kernel = knl
+            break
+
+        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel,
+                options=options, codegen_kwargs=codegen_kwargs)
+        if print_ref_code:
+            print 75*"-"
+            print "Reference Code:"
+            print 75*"-"
+            print get_highlighted_code(ref_compiled.code)
+            print 75*"-"
+
+        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())
+
+        try:
+            ref_args, ref_arg_data = \
+                    make_ref_args(ref_sched_kernel, ref_cl_kernel_info.cl_arg_info,
+                            ref_queue, parameters,
+                            fill_value=fill_value_ref)
+            ref_args["out_host"] = False
+        except cl.RuntimeError, e:
+            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
+                import traceback
+                ref_errors.append("\n".join([
+                    75*"-",
+                    "On %s:" % dev,
+                    75*"-",
+                    traceback.format_exc(),
+                    75*"-"]))
+
+                continue
+            else:
+                raise
+
+        found_ref_device = True
+
+        if not do_check:
+            break
+
+        ref_queue.finish()
+        ref_start = time()
+
+        print "using %s for the reference calculation" % dev
+
+        if not AUTO_TEST_SKIP_RUN:
+            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
+        else:
+            ref_evt = cl.enqueue_marker(ref_queue)
+
+        ref_queue.finish()
+        ref_stop = time()
+        ref_elapsed_wall = ref_stop-ref_start
+
+        ref_evt.wait()
+        ref_elapsed = 1e-9*(ref_evt.profile.END-ref_evt.profile.SUBMIT)
+
+        break
+
+    if not found_ref_device:
+        raise RuntimeError("could not find a suitable device for the "
+                "reference computation.\n"
+                "These errors were encountered:\n"+"\n".join(ref_errors))
+
+    # }}}
+
+    # {{{ compile and run parallel code
+
+    need_check = do_check
+
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    args = None
+    for i, kernel in enumerate(kernel_gen):
+        compiled = CompiledKernel(ctx, kernel, options=options,
+                codegen_kwargs=codegen_kwargs)
+
+        if args is None:
+            cl_kernel_info = compiled.cl_kernel_info(frozenset())
+
+            args = make_args(kernel, cl_kernel_info.cl_arg_info,
+                    queue, ref_arg_data, parameters, fill_value=fill_value)
+        args["out_host"] = False
+
+        print 75*"-"
+        print "Kernel #%d:" % i
+        print 75*"-"
+        if print_code:
+            print compiled.get_highlighted_code()
+            print 75*"-"
+        if dump_binary:
+            print type(compiled.cl_program)
+            print compiled.cl_program.binaries[0]
+            print 75*"-"
+
+        for warmup_round in range(warmup_rounds):
+            if not AUTO_TEST_SKIP_RUN:
+                compiled(queue, code_op=code_op, **args)
+
+            if need_check and not AUTO_TEST_SKIP_RUN:
+                for arg_desc in ref_arg_data:
+                    if arg_desc is None:
+                        continue
+                    if not arg_desc.needs_checking:
+                        continue
+
+                    from pyopencl.compyte.array import as_strided
+                    ref_ary = as_strided(
+                            arg_desc.ref_storage_array.get(),
+                            shape=arg_desc.ref_shape,
+                            strides=arg_desc.ref_numpy_strides).flatten()
+                    test_ary = as_strided(
+                            arg_desc.test_storage_array.get(),
+                            shape=arg_desc.test_shape,
+                            strides=arg_desc.test_numpy_strides).flatten()
+                    common_len = min(len(ref_ary), len(test_ary))
+                    ref_ary = ref_ary[:common_len]
+                    test_ary = test_ary[:common_len]
+
+                    error_is_small, error = check_result(test_ary, ref_ary)
+                    assert error_is_small, error
+                    need_check = False
+
+        # let warm-up work drain before any timing starts
+        queue.finish()
+
+        timing_rounds = max(warmup_rounds, 1)
+
+        while True:
+            events = []  # fresh per attempt; 'elapsed' must not span retries
+            start_time = time()
+
+            evt_start = cl.enqueue_marker(queue)
+
+            for timing_round in range(timing_rounds):
+                if not AUTO_TEST_SKIP_RUN:
+                    evt, _ = compiled(queue, code_op=code_op, **args)
+                    events.append(evt)
+                else:
+                    events.append(cl.enqueue_marker(queue))
+
+            evt_end = cl.enqueue_marker(queue)
+
+            queue.finish()
+            stop_time = time()
+
+            for evt in events:
+                evt.wait()
+            evt_start.wait()
+            evt_end.wait()
+
+            elapsed = (1e-9*events[-1].profile.END
+                    - 1e-9*events[0].profile.SUBMIT) \
+                    / timing_rounds
+            try:
+                elapsed_evt_2 = "%g" % \
+                        ((1e-9*evt_end.profile.START
+                            - 1e-9*evt_start.profile.START)
+                        / timing_rounds)
+            except cl.RuntimeError:
+                elapsed_evt_2 = "<unavailable>"
+
+            elapsed_wall = (stop_time-start_time)/timing_rounds
+
+            if elapsed_wall * timing_rounds < 0.3:
+                timing_rounds *= 4
+            else:
+                break
+
+        rates = ""
+        for cnt, lbl in zip(op_count, op_label):
+            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)
+
+        print("elapsed: %g s event, %s s marker-event %g s wall "
+                "(%d rounds)%s" % (
+                elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates))
+
+        if do_check:
+            ref_rates = ""
+            for cnt, lbl in zip(op_count, op_label):
+                ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl)
+            print "ref: elapsed: %g s event, %g s wall%s" % (
+                    ref_elapsed, ref_elapsed_wall, ref_rates)
+
+    # }}}
+
+# }}}
+
+from pytools import MovedFunctionDeprecationWrapper
+# former name of auto_test_vs_ref, kept via pytools' deprecation wrapper
+auto_test_vs_seq = MovedFunctionDeprecationWrapper(auto_test_vs_ref)
+
+# vim: foldmethod=marker
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 8312b99d2702a653d45cf0014c7970f68170a0e4..c5a44b83761b8c4590a52e64d7811e50e638666e 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -30,7 +30,70 @@ import numpy as np
 
 from pytools import Record, memoize_method
 
-AUTO_TEST_SKIP_RUN = False
+
+# {{{ domain parameter finder
+
+class DomainParameterFinder(object):
+    """Finds parameters from shapes of passed arguments."""
+
+    def __init__(self, kernel, cl_arg_info):
+        # a mapping from parameter names to a list of tuples
+        # (arg_name, axis_nr, function), where function is a
+        # unary function of kernel.arg_dict[arg_name].shape[axis_nr]
+        # returning the desired parameter.
+        self.param_to_sources = param_to_sources = {}
+
+        param_names = kernel.all_params()
+
+        from loopy.kernel.data import GlobalArg
+        from loopy.symbolic import DependencyMapper
+        from pymbolic import compile
+        dep_map = DependencyMapper()
+
+        from pymbolic import var
+        for arg in cl_arg_info:
+            if arg.arg_class is GlobalArg and arg.shape is not None:
+                for axis_nr, shape_i in enumerate(arg.shape):
+                    deps = dep_map(shape_i)
+                    if len(deps) == 1:
+                        dep, = deps
+
+                        if dep.name in param_names:
+                            from pymbolic.algorithm import solve_affine_equations_for
+                            try:
+                                # solve "shape_i == <axis length expr>" for the parameter
+                                param_expr = solve_affine_equations_for(
+                                        [dep.name], [(shape_i, var("shape_i"))]
+                                        )[dep.name]
+                            except Exception:
+                                # best effort: this axis just isn't a source
+                                pass
+                            else:
+                                param_func = compile(param_expr, ["shape_i"])
+                                param_to_sources.setdefault(dep.name, []).append(
+                                        (arg.name, axis_nr, param_func))
+
+    def __call__(self, kwargs):
+        """Return a dict of the parameters missing from *kwargs* that can be
+        computed from the shapes of the arrays passed in *kwargs*."""
+        result = {}
+        for param_name, sources in self.param_to_sources.iteritems():
+            if param_name not in kwargs:
+                for arg_name, axis_nr, shape_func in sources:
+                    if arg_name in kwargs:
+                        try:
+                            shape_axis = kwargs[arg_name].shape[axis_nr]
+                        except IndexError:
+                            raise RuntimeError("Argument '%s' has unexpected shape. "
+                                    "Tried to access axis %d (0-based), only %d "
+                                    "axes present." %
+                                    (arg_name, axis_nr, len(kwargs[arg_name].shape)))
+                        result[param_name] = shape_func(shape_axis)
+                        break  # first usable source wins
+
+        return result
+
+# }}}
 
 
 # {{{ argument checking
@@ -201,7 +264,8 @@ class CompiledKernel:
 
         return kernel_info.copy(
                 cl_kernel=cl_kernel,
-                cl_arg_info=cl_arg_info)
+                cl_arg_info=cl_arg_info,
+                domain_parameter_finder=DomainParameterFinder(kernel, cl_arg_info))
 
     # {{{ debugging aids
 
@@ -271,7 +335,7 @@ class CompiledKernel:
         # }}}
 
         kwargs.update(
-                kernel.domain_parameter_finder()(kwargs))
+                kernel_info.domain_parameter_finder(kwargs))
 
         domain_parameters = dict((name, int(kwargs[name]))
                 for name in kernel.scalar_loop_args)
@@ -405,528 +469,4 @@ def get_highlighted_code(text):
         return highlight(text, CLexer(), TerminalFormatter())
 
 
-# {{{ automatic testing
-
-def fill_rand(ary):
-    from pyopencl.clrandom import fill_rand
-    if ary.dtype.kind == "c":
-        real_dtype = ary.dtype.type(0).real.dtype
-        real_ary = ary.view(real_dtype)
-
-        fill_rand(real_ary, luxury=0)
-    else:
-        fill_rand(ary, luxury=0)
-
-
-class TestArgInfo(Record):
-    pass
-
-
-def make_ref_args(kernel, cl_arg_info, queue, parameters, fill_value):
-    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
-
-    from pymbolic import evaluate
-
-    ref_args = {}
-    ref_arg_data = []
-
-    for arg in cl_arg_info:
-        if arg.arg_class is ValueArg:
-            if arg.offset_for_name:
-                continue
-
-            arg_value = parameters[arg.name]
-
-            try:
-                argv_dtype = arg_value.dtype
-            except AttributeError:
-                argv_dtype = None
-
-            if argv_dtype != arg.dtype:
-                arg_value = arg.dtype.type(arg_value)
-
-            ref_args[arg.name] = arg_value
-
-            ref_arg_data.append(None)
-
-        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
-            if arg.shape is None:
-                raise ValueError("arrays need known shape to use automatic "
-                        "testing")
-
-            shape = evaluate(arg.shape, parameters)
-
-            is_output = arg.base_name in kernel.get_written_variables()
-
-            if arg.arg_class is ImageArg:
-                storage_array = ary = cl_array.empty(
-                        queue, shape, arg.dtype, order="C")
-                numpy_strides = None
-                alloc_size = None
-                strides = None
-            else:
-                strides = evaluate(arg.strides, parameters)
-
-                from pytools import all
-                assert all(s > 0 for s in strides)
-                alloc_size = sum(astrd*(alen-1)
-                        for alen, astrd in zip(shape, strides)) + 1
-
-                dtype = arg.dtype
-                if dtype is None:
-                    raise RuntimeError("dtype for argument '%s' is not yet "
-                            "known. Perhaps you want to use "
-                            "loopy.add_argument_dtypes "
-                            "or loopy.infer_argument_dtypes?"
-                            % arg.name)
-
-                itemsize = dtype.itemsize
-                numpy_strides = [itemsize*s for s in strides]
-
-                storage_array = cl_array.empty(queue, alloc_size, dtype)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
-
-            if is_output:
-                if arg.arg_class is ImageArg:
-                    raise RuntimeError("write-mode images not supported in "
-                            "automatic testing")
-
-                if dtype.isbuiltin:
-                    storage_array.fill(fill_value)
-                else:
-                    from warnings import warn
-                    warn("Cannot pre-fill array of dtype '%s'" % dtype)
-
-                ref_args[arg.name] = ary
-            else:
-                fill_rand(storage_array)
-                if arg.arg_class is ImageArg:
-                    # must be contiguous
-                    ref_args[arg.name] = cl.image_from_array(
-                            queue.context, ary.get())
-                else:
-                    ref_args[arg.name] = ary
-
-            ref_arg_data.append(
-                    TestArgInfo(
-                        name=arg.name,
-                        ref_array=ary,
-                        ref_storage_array=storage_array,
-                        ref_shape=shape,
-                        ref_strides=strides,
-                        ref_alloc_size=alloc_size,
-                        ref_numpy_strides=numpy_strides,
-                        needs_checking=is_output))
-        else:
-            raise RuntimeError("arg type not understood")
-
-    return ref_args, ref_arg_data
-
-
-def make_args(kernel, cl_arg_info, queue, ref_arg_data, parameters,
-        fill_value):
-    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg
-
-    from pymbolic import evaluate
-
-    args = {}
-    for arg, arg_desc in zip(cl_arg_info, ref_arg_data):
-        if arg.arg_class is ValueArg:
-            arg_value = parameters[arg.name]
-
-            try:
-                argv_dtype = arg_value.dtype
-            except AttributeError:
-                argv_dtype = None
-
-            if argv_dtype != arg.dtype:
-                arg_value = arg.dtype.type(arg_value)
-
-            args[arg.name] = arg_value
-
-        elif arg.arg_class is ImageArg:
-            if arg.name in kernel.get_written_variables():
-                raise NotImplementedError("write-mode images not supported in "
-                        "automatic testing")
-
-            shape = evaluate(arg.shape, parameters)
-            assert shape == arg_desc.ref_shape
-
-            # must be contiguous
-            args[arg.name] = cl.image_from_array(
-                    queue.context, arg_desc.ref_array.get())
-
-        elif arg.arg_class is GlobalArg:
-            shape = evaluate(arg.shape, parameters)
-            strides = evaluate(arg.strides, parameters)
-
-            itemsize = arg.dtype.itemsize
-            numpy_strides = [itemsize*s for s in strides]
-
-            assert all(s > 0 for s in strides)
-            alloc_size = sum(astrd*(alen-1)
-                    for alen, astrd in zip(shape, strides)) + 1
-
-            if arg.base_name in kernel.get_written_variables():
-                storage_array = cl_array.empty(queue, alloc_size, arg.dtype)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
-
-                if arg.dtype.isbuiltin:
-                    storage_array.fill(fill_value)
-                else:
-                    from warnings import warn
-                    warn("Cannot pre-fill array of dtype '%s'" % arg.dtype)
-
-                args[arg.name] = ary
-            else:
-                # use contiguous array to transfer to host
-                host_ref_contig_array = arg_desc.ref_storage_array.get()
-
-                # use device shape/strides
-                from pyopencl.compyte.array import as_strided
-                host_ref_array = as_strided(host_ref_contig_array,
-                        arg_desc.ref_shape, arg_desc.ref_numpy_strides)
-
-                # flatten the thing
-                host_ref_flat_array = host_ref_array.flatten()
-
-                # create host array with test shape (but not strides)
-                host_contig_array = np.empty(shape, dtype=arg.dtype)
-
-                common_len = min(
-                        len(host_ref_flat_array),
-                        len(host_contig_array.ravel()))
-                host_contig_array.ravel()[:common_len] = \
-                        host_ref_flat_array[:common_len]
-
-                # create host array with test shape and storage layout
-                host_storage_array = np.empty(alloc_size, arg.dtype)
-                host_array = as_strided(
-                        host_storage_array, shape, numpy_strides)
-                host_array[:] = host_contig_array
-
-                host_contig_array = arg_desc.ref_storage_array.get()
-                storage_array = cl_array.to_device(queue, host_storage_array)
-                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
-
-                args[arg.name] = ary
-
-            arg_desc.test_storage_array = storage_array
-            arg_desc.test_array = ary
-            arg_desc.test_shape = shape
-            arg_desc.test_strides = strides
-            arg_desc.test_numpy_strides = numpy_strides
-            arg_desc.test_alloc_size = alloc_size
-
-        else:
-            raise RuntimeError("arg type not understood")
-
-    return args
-
-
-def _default_check_result(result, ref_result):
-    if not np.allclose(ref_result, result, rtol=1e-3, atol=1e-3):
-        l2_err = (
-                np.sum(np.abs(ref_result-result)**2)
-                /
-                np.sum(np.abs(ref_result)**2))
-        linf_err = (
-                np.max(np.abs(ref_result-result))
-                /
-                np.max(np.abs(ref_result-result)))
-        return (False,
-                "results do not match(rel) l_2 err: %g, l_inf err: %g"
-                % (l2_err, linf_err))
-    else:
-        return True, None
-
-
-def _enumerate_cl_devices_for_ref_test():
-    noncpu_devs = []
-    cpu_devs = []
-
-    from warnings import warn
-
-    for pf in cl.get_platforms():
-        if pf.name == "Portable OpenCL":
-            # That implementation [1] isn't quite good enough yet.
-            # [1] https://launchpad.net/pocl
-            # FIXME remove when no longer true.
-            warn("Skipping 'Portable OpenCL' for lack of maturity.")
-            continue
-
-        for dev in pf.get_devices():
-            if dev.type == cl.device_type.CPU:
-                cpu_devs.append(dev)
-            else:
-                noncpu_devs.append(dev)
-
-    if not (cpu_devs or noncpu_devs):
-        raise RuntimeError("no CL device found for test")
-
-    if not cpu_devs:
-        warn("No CPU device found for reference test. The reference "
-                "computation will either fail because of a timeout "
-                "or take a *very* long time.")
-
-    for dev in cpu_devs:
-        yield dev
-
-    for dev in noncpu_devs:
-        yield dev
-
-
-def auto_test_vs_ref(
-        ref_knl, ctx, kernel_gen, op_count=[], op_label=[], parameters={},
-        print_ref_code=False, print_code=True, warmup_rounds=2,
-        code_op=None, dump_binary=False, codegen_kwargs={},
-        options=[],
-        fills_entire_output=True, do_check=True, check_result=None
-        ):
-    """Compare results of `ref_knl` to the kernels generated by the generator
-    `kernel_gen`.
-
-    :arg check_result: a callable with :class:`numpy.ndarray` arguments
-        *(result, reference_result)* returning a a tuple (class:`bool`,
-        message) indicating correctness/acceptability of the result
-    """
-
-    if isinstance(op_count, (int, float)):
-        from warnings import warn
-        warn("op_count should be a list", stacklevel=2)
-        op_count = [op_count]
-    if isinstance(op_label, str):
-        from warnings import warn
-        warn("op_label should be a list", stacklevel=2)
-        op_label = [op_label]
-
-    read_and_written_args = (
-            ref_knl.get_read_variables()
-            & ref_knl.get_written_variables()
-            & set(ref_knl.arg_dict))
-
-    if read_and_written_args:
-        # FIXME: In principle, that's possible to test
-        raise RuntimeError("kernel reads *and* writes argument(s) '%s' "
-                "and therefore cannot be automatically tested"
-                % ", ".join(read_and_written_args))
-
-    from time import time
-
-    if check_result is None:
-        check_result = _default_check_result
-
-    if fills_entire_output:
-        fill_value_ref = -17
-        fill_value = -18
-    else:
-        fill_value_ref = -17
-        fill_value = fill_value_ref
-
-    # {{{ compile and run reference code
-
-    found_ref_device = False
-
-    ref_errors = []
-
-    for dev in _enumerate_cl_devices_for_ref_test():
-        ref_ctx = cl.Context([dev])
-        ref_queue = cl.CommandQueue(ref_ctx,
-                properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-        import loopy as lp
-        ref_kernel_gen = lp.generate_loop_schedules(ref_knl)
-        for knl in lp.check_kernels(ref_kernel_gen, parameters):
-            ref_sched_kernel = knl
-            break
-
-        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel,
-                options=options, codegen_kwargs=codegen_kwargs)
-        if print_ref_code:
-            print 75*"-"
-            print "Reference Code:"
-            print 75*"-"
-            print get_highlighted_code(ref_compiled.code)
-            print 75*"-"
-
-        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())
-
-        try:
-            ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel, ref_cl_kernel_info.cl_arg_info,
-                            ref_queue, parameters,
-                            fill_value=fill_value_ref)
-            ref_args["out_host"] = False
-        except cl.RuntimeError, e:
-            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
-                import traceback
-                ref_errors.append("\n".join([
-                    75*"-",
-                    "On %s:" % dev,
-                    75*"-",
-                    traceback.format_exc(),
-                    75*"-"]))
-
-                continue
-            else:
-                raise
-
-        found_ref_device = True
-
-        if not do_check:
-            break
-
-        ref_queue.finish()
-        ref_start = time()
-
-        print "using %s for the reference calculation" % dev
-
-        if not AUTO_TEST_SKIP_RUN:
-            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
-        else:
-            ref_evt = cl.enqueue_marker(ref_queue)
-
-        ref_queue.finish()
-        ref_stop = time()
-        ref_elapsed_wall = ref_stop-ref_start
-
-        ref_evt.wait()
-        ref_elapsed = 1e-9*(ref_evt.profile.END-ref_evt.profile.SUBMIT)
-
-        break
-
-    if not found_ref_device:
-        raise RuntimeError("could not find a suitable device for the "
-                "reference computation.\n"
-                "These errors were encountered:\n"+"\n".join(ref_errors))
-
-    # }}}
-
-    # {{{ compile and run parallel code
-
-    need_check = do_check
-
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-    args = None
-    for i, kernel in enumerate(kernel_gen):
-        compiled = CompiledKernel(ctx, kernel, options=options,
-                codegen_kwargs=codegen_kwargs)
-
-        if args is None:
-            cl_kernel_info = compiled.cl_kernel_info(frozenset())
-
-            args = make_args(kernel, cl_kernel_info.cl_arg_info,
-                    queue, ref_arg_data, parameters, fill_value=fill_value)
-        args["out_host"] = False
-
-        print 75*"-"
-        print "Kernel #%d:" % i
-        print 75*"-"
-        if print_code:
-            print compiled.get_highlighted_code()
-            print 75*"-"
-        if dump_binary:
-            print type(compiled.cl_program)
-            print compiled.cl_program.binaries[0]
-            print 75*"-"
-
-        for i in range(warmup_rounds):
-            if not AUTO_TEST_SKIP_RUN:
-                compiled(queue, code_op=code_op, **args)
-
-            if need_check and not AUTO_TEST_SKIP_RUN:
-                for arg_desc in ref_arg_data:
-                    if arg_desc is None:
-                        continue
-                    if not arg_desc.needs_checking:
-                        continue
-
-                    from pyopencl.compyte.array import as_strided
-                    ref_ary = as_strided(
-                            arg_desc.ref_storage_array.get(),
-                            shape=arg_desc.ref_shape,
-                            strides=arg_desc.ref_numpy_strides).flatten()
-                    test_ary = as_strided(
-                            arg_desc.test_storage_array.get(),
-                            shape=arg_desc.test_shape,
-                            strides=arg_desc.test_numpy_strides).flatten()
-                    common_len = min(len(ref_ary), len(test_ary))
-                    ref_ary = ref_ary[:common_len]
-                    test_ary = test_ary[:common_len]
-
-                    error_is_small, error = check_result(test_ary, ref_ary)
-                    assert error_is_small, error
-                    need_check = False
-
-        events = []
-        queue.finish()
-
-        timing_rounds = warmup_rounds
-
-        while True:
-            from time import time
-            start_time = time()
-
-            evt_start = cl.enqueue_marker(queue)
-
-            for i in range(timing_rounds):
-                if not AUTO_TEST_SKIP_RUN:
-                    evt, _ = compiled(queue, code_op=code_op, **args)
-                    events.append(evt)
-                else:
-                    events.append(cl.enqueue_marker(queue))
-
-            evt_end = cl.enqueue_marker(queue)
-
-            queue.finish()
-            stop_time = time()
-
-            for evt in events:
-                evt.wait()
-            evt_start.wait()
-            evt_end.wait()
-
-            elapsed = (1e-9*events[-1].profile.END
-                    - 1e-9*events[0].profile.SUBMIT) \
-                    / timing_rounds
-            try:
-                elapsed_evt_2 = "%g" % \
-                        ((1e-9*evt_end.profile.START
-                            - 1e-9*evt_start.profile.START)
-                        / timing_rounds)
-            except cl.RuntimeError:
-                elapsed_evt_2 = "<unavailable>"
-
-            elapsed_wall = (stop_time-start_time)/timing_rounds
-
-            if elapsed_wall * timing_rounds < 0.3:
-                timing_rounds *= 4
-            else:
-                break
-
-        rates = ""
-        for cnt, lbl in zip(op_count, op_label):
-            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)
-
-        print("elapsed: %g s event, %s s marker-event %g s wall "
-                "(%d rounds)%s" % (
-                elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates))
-
-        if do_check:
-            ref_rates = ""
-            for cnt, lbl in zip(op_count, op_label):
-                ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl)
-            print "ref: elapsed: %g s event, %g s wall%s" % (
-                    ref_elapsed, ref_elapsed_wall, ref_rates)
-
-    # }}}
-
-from pytools import MovedFunctionDeprecationWrapper
-
-auto_test_vs_seq = MovedFunctionDeprecationWrapper(auto_test_vs_ref)
-
-# }}}
-
 # vim: foldmethod=marker
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 744805e9c8c715ca5dd8ce50717bbbe6b3095e86..46b79186c22870fc8ec6354d4e6cf5d6f3ca420e 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -894,11 +894,6 @@ class LoopKernel(Record):
 
     # }}}
 
-    @memoize_method
-    def domain_parameter_finder(self):
-        from loopy.kernel.tools import DomainParameterFinder
-        return DomainParameterFinder(self)
-
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 47707fb07eb3f9ae4301f9ca3e04e3ccabd34198..652edf8e55da944622ef6c1ea50d69d1c1789909 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -658,7 +658,7 @@ class ArrayBase(Record):
 
                 for i in xrange(shape_i):
                     for res in gen_decls(name_suffix + "_s%d" % i,
-                            shape, dtype,
+                            shape, strides, dtype,
                             user_index + (i,)):
                         yield res