diff --git a/loopy/compiled.py b/loopy/compiled.py
index 1568736863ce121194c8837f28849fad6023dd3d..053bc1d87acede5ee865ca4272f53205b37d8c8b 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -507,6 +507,43 @@ def _default_check_result(result, ref_result):
 
 
 
+def _enumerate_cl_devices_for_ref_test():
+    """Yield CL devices for the reference run, CPU devices first.
+
+    Skips 'Portable OpenCL'; raises RuntimeError if no device is left.
+    """
+    from warnings import warn
+
+    noncpu_devs = []
+    cpu_devs = []
+
+    for pf in cl.get_platforms():
+        if pf.name == "Portable OpenCL":
+            # That implementation [1] isn't quite good enough yet.
+            # [1] https://launchpad.net/pocl
+            # FIXME remove when no longer true.
+            warn("Skipping 'Portable OpenCL' for lack of maturity.")
+            continue
+
+        for dev in pf.get_devices():
+            # The CL device type is a bit field: test the CPU bit, not equality.
+            if dev.type & cl.device_type.CPU:
+                cpu_devs.append(dev)
+            else:
+                noncpu_devs.append(dev)
+
+    if not (cpu_devs or noncpu_devs):
+        raise RuntimeError("no CL device found for test")
+
+    if not cpu_devs:
+        warn("No CPU device found for reference test. The reference computation "
+                "will either fail because of a timeout or take a *very* long "
+                "time.")
+
+    for dev in cpu_devs + noncpu_devs:
+        yield dev
+
+
 def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], parameters={},
         print_ref_code=False, print_code=True, warmup_rounds=2,
         edit_code=False, dump_binary=False, codegen_kwargs={},
@@ -552,39 +589,13 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet
         fill_value_ref = -17
         fill_value = fill_value_ref
 
-    # {{{ find candidate devices for reference run
-
-    all_devs = []
-    cpu_devs = []
-
-    for pf in cl.get_platforms():
-        if pf.name == "Portable OpenCL":
-            # That implementation [1] isn't quite good enough yet.
-            # [1] https://launchpad.net/pocl
-            # FIXME remove when no longer true.
-            continue
-
-        for dev in pf.get_devices():
-            all_devs.append(dev)
-            if dev.type == cl.device_type.CPU:
-                cpu_devs.append(dev)
-
-    if not cpu_devs:
-        if not all_devs:
-            raise RuntimeError("no CL device found for test")
-
-        ref_devs = all_devs
-
-        from warnings import warn
-        warn("No CPU device found for reference test.")
-    else:
-        ref_devs = cpu_devs
+    # {{{ compile and run reference code
 
-    # }}}
+    found_ref_device = False
 
-    # {{{ compile and run reference code
+    ref_errors = []
 
-    for dev in ref_devs:
+    for dev in _enumerate_cl_devices_for_ref_test():
         ref_ctx = cl.Context([dev])
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
@@ -602,10 +613,20 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet
             ref_args["out_host"] = False
         except cl.RuntimeError, e:
             if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
+                import traceback
+                ref_errors.append("\n".join([
+                    75*"-",
+                    "On %s:" % dev,
+                    75*"-",
+                    traceback.format_exc(),
+                    75*"-"]))
+
                 continue
             else:
                 raise
 
+        found_ref_device = True
+
         if not do_check:
             break
 
@@ -636,6 +657,10 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet
 
         break
 
+    if not found_ref_device:
+        raise RuntimeError("could not find a suitable device for the reference computation.\n"
+                "These errors were encountered:\n"+"\n".join(ref_errors))
+
     # }}}
 
     # {{{ compile and run parallel code