From 89f3ee561a8022123247af3bc892737ec91dfd44 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 24 Aug 2012 23:58:34 -0400
Subject: [PATCH] Add, use instruction priority. Add {options} instruction
 syntax.

---
 MEMO                                     | 10 ++-
 doc/reference.rst                        |  7 +++
 loopy/__init__.py                        | 76 +++++++++++++++++++++++
 loopy/kernel.py                          | 77 +++++++++++++++++-------
 loopy/schedule.py                        | 72 ++++++++++++++--------
 test/test_linalg.py                      | 48 ++++++---------
 test/test_loopy.py                       |  2 +-
 {proto-tests => test}/test_sem_reagan.py |  0
 8 files changed, 206 insertions(+), 86 deletions(-)
 rename {proto-tests => test}/test_sem_reagan.py (100%)

diff --git a/MEMO b/MEMO
index aeeb733a0..d9d823b65 100644
--- a/MEMO
+++ b/MEMO
@@ -41,8 +41,6 @@ Things to consider
 To-do
 ^^^^^
 
-- Clean up loopy.kernel.
-
 - Group instructions by dependency/inames for scheduling, to
   increase sched. scalability
 
@@ -51,10 +49,6 @@ To-do
 - What if no universally valid precompute base index expression is found?
   (test_intel_matrix_mul with n = 6*16, e.g.?)
 
-- Add dependencies after the fact
-
-- Scalar insn priority
-
 - If finding a maximum proves troublesome, move parameters into the domain
 
 - : (as in, Matlab full-slice) in prefetches
@@ -111,6 +105,10 @@ Future ideas
 Dealt with
 ^^^^^^^^^^
 
+- Add dependencies after the fact
+
+- Scalar insn priority
+
 - ScalarArg is a bad name
   -> renamed to ValueArg
 
diff --git a/doc/reference.rst b/doc/reference.rst
index 69b1b0fe2..9b9784bef 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -182,6 +182,13 @@ Manipulating Reductions
 
 .. autofunction:: realize_reduction
 
+Manipulating Instructions
+-------------------------
+
+.. autofunction:: set_instruction_priority
+
+.. autofunction:: add_dependency
+
 Finishing up
 ------------
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 43c13f8f6..dc6c10c85 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -263,6 +263,9 @@ def tag_dimensions(kernel, iname_to_tag, force=False):
 
     new_iname_to_tag = kernel.iname_to_tag.copy()
     for iname, new_tag in iname_to_tag.iteritems():
+        if iname not in kernel.all_inames():
+            raise RuntimeError("iname '%s' does not exist" % iname)
+
         old_tag = kernel.iname_to_tag.get(iname)
 
         retag_ok = False
@@ -422,6 +425,79 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     else:
         return new_kernel
 
+# }}}
+
+# {{{ instruction processing
+
+class _IdMatch(object):  # base: stores the id or compiled pattern to match
+    def __init__(self, value):
+        self.value = value
+
+class _ExactIdMatch(_IdMatch):  # matches insns whose id equals self.value
+    def __call__(self, insn):
+        return insn.id == self.value
+
+class _ReIdMatch(_IdMatch):  # needs _IdMatch.__init__; value: compiled regex
+    def __call__(self, insn):
+        return self.value.match(insn.id) is not None
+
+def _parse_insn_match(insn_match):
+    """Turn *insn_match* into a callable deciding whether an instruction matches:
+    a plain id (compared exactly), ``glob:PATTERN``, or ``re:PATTERN``."""
+    import re
+    colon_idx = insn_match.find(":")
+    if colon_idx == -1:
+        return _ExactIdMatch(insn_match)
+
+    match_tp = insn_match[:colon_idx]
+    match_val = insn_match[colon_idx+1:]
+
+    if match_tp == "glob":
+        from fnmatch import translate
+        return _ReIdMatch(re.compile(translate(match_val)))
+    elif match_tp == "re":
+        return _ReIdMatch(re.compile(match_val))
+    else:
+        raise ValueError("match type '%s' not understood" % match_tp)
+
+
+def find_instructions(kernel, insn_match):  # -> list of matching instructions
+    match = _parse_insn_match(insn_match)
+    return [insn for insn in kernel.instructions if match(insn)]
+
+def map_instructions(kernel, insn_match, f):
+    """Return a copy of *kernel* in which each instruction matching
+    *insn_match* has been replaced by the result of calling *f* on it.
+    Instructions that do not match are carried over unchanged.
+    """
+
+    matches = _parse_insn_match(insn_match)
+
+    return kernel.copy(instructions=[
+        f(insn) if matches(insn) else insn
+        for insn in kernel.instructions
+        ])
+
+def set_instruction_priority(kernel, insn_match, priority):
+    """Return a copy of *kernel* in which every instruction matching
+    *insn_match* has its scheduling priority set to *priority*.
+
+    *insn_match* may be an instruction id, a regular expression prefixed by
+    ``re:``, or a file-name-style glob prefixed by ``glob:``.
+    """
+    def set_prio(insn): return insn.copy(priority=priority)
+    return map_instructions(kernel, insn_match, set_prio)
+
+def add_dependency(kernel, insn_match, dependency):
+    """Return a copy of *kernel* in which the instruction id *dependency* has
+    been added to the dependencies of every instruction matched by *insn_match*.
+
+    *insn_match* may be an instruction id, a regular expression prefixed by
+    ``re:``, or a file-name-style glob prefixed by ``glob:``.
+    """
+    # insn_deps may be a set (the Instruction default) or a list: normalize.
+    def add_dep(insn): return insn.copy(insn_deps=list(insn.insn_deps) + [dependency])
+    return map_instructions(kernel, insn_match, add_dep)
 
 # }}}
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index a6d8095bb..c14459298 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -296,6 +296,7 @@ class Instruction(Record):
         of the program. Allowed values are *None* (for unknown), *True*, and *False*.
     :ivar boostable_into: a set of inames into which the instruction
         may need to be boosted, as a heuristic help for the scheduler.
+    :ivar priority: scheduling priority
 
     The following two instance variables are only used until :func:`loopy.make_kernel` is
     finished:
@@ -309,7 +310,8 @@ class Instruction(Record):
             id, assignee, expression,
             forced_iname_deps=frozenset(), insn_deps=set(), boostable=None,
             boostable_into=None,
-            temp_var_type=None, duplicate_inames_and_tags=[]):
+            temp_var_type=None, duplicate_inames_and_tags=[],
+            priority=0):
 
         from loopy.symbolic import parse
         if isinstance(assignee, str):
@@ -325,7 +327,9 @@ class Instruction(Record):
                 forced_iname_deps=forced_iname_deps,
                 insn_deps=insn_deps, boostable=boostable,
                 boostable_into=boostable_into,
-                temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags)
+                temp_var_type=temp_var_type,
+                duplicate_inames_and_tags=duplicate_inames_and_tags,
+                priority=priority)
 
     @memoize_method
     def reduction_inames(self):
@@ -358,8 +362,12 @@ class Instruction(Record):
         else:
             raise RuntimeError("unexpected value for Instruction.boostable")
 
+        options = []  # rendered using the keys the instruction parser accepts
         if self.insn_deps:
-            result += "\n    : " + ", ".join(self.insn_deps)
+            options.append("dep="+":".join(self.insn_deps))
+        if self.priority:
+            options.append("priority=%d" % self.priority)
+        if options: result += "\n    {%s}" % ", ".join(options)
 
         return result
 
@@ -644,7 +652,7 @@ class LoopKernel(Record):
         were applied to the kernel. These are stored so that they may be repeated
         on expressions the user specifies later.
     :ivar cache_manager:
-    :ivar lowest_priority_inames:
+    :ivar lowest_priority_inames: (used internally to realize ILP)
     :ivar breakable_inames: these inames' loops may be broken up by the scheduler
 
     The following instance variables are only used until :func:`loopy.make_kernel` is
@@ -695,14 +703,13 @@ class LoopKernel(Record):
         INAME_ENTRY_RE = re.compile(
                 r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$")
         INSN_RE = re.compile(
-                r"^\s*(?:(?P<label>\w+):)?"
                 "\s*(?:\["
                     "(?P<iname_deps_and_tags>[\s\w,:.]*)"
                     "(?:\|(?P<duplicate_inames_and_tags>[\s\w,:.]*))?"
                 "\])?"
                 "\s*(?:\<(?P<temp_var_type>.*?)\>)?"
                 "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"
-                "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$"
+                "\s*?(?:\{(?P<options>[\s\w=,:]+)\}\s*)?$"
                 )
         SUBST_RE = re.compile(
                 r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$"
@@ -738,7 +745,7 @@ class LoopKernel(Record):
             insn_match = INSN_RE.match(insn)
             subst_match = SUBST_RE.match(insn)
             if insn_match is not None and subst_match is not None:
-                raise RuntimeError("insn parse error")
+                raise RuntimeError("instruction parse error: %s" % insn)
 
             if insn_match is not None:
                 groups = insn_match.groupdict()
@@ -752,15 +759,33 @@ class LoopKernel(Record):
             rhs = parse(groups["rhs"])
 
             if insn_match is not None:
-                if groups["label"] is not None:
-                    label = groups["label"]
-                else:
-                    label = "insn"
-
-                if groups["insn_deps"] is not None:
-                    insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(","))
-                else:
-                    insn_deps = set()
+                insn_deps = set()
+                insn_id = "insn"
+                priority = 0
+
+                # options have the form "{key=value, key=value, ...}"
+                if groups["options"] is not None:
+                    for option in groups["options"].split(","):
+                        option = option.strip()
+                        if not option:
+                            raise RuntimeError("empty option supplied")
+
+                        equal_idx = option.find("=")
+                        if equal_idx == -1:
+                            raise ValueError("instruction option '%s' "
+                                    "lacks a value" % option)
+                        opt_key = option[:equal_idx].strip()
+                        opt_value = option[equal_idx+1:].strip()
+
+                        if opt_key == "id":
+                            insn_id = opt_value
+                        elif opt_key == "priority":
+                            priority = int(opt_value)
+                        elif opt_key == "dep":
+                            insn_deps = [d.strip() for d in opt_value.split(":")]
+                        else:
+                            raise ValueError("unrecognized instruction option '%s'"
+                                    % opt_key)
 
                 if groups["iname_deps_and_tags"] is not None:
                     inames_and_tags = parse_iname_and_tag_list(
@@ -792,12 +817,14 @@ class LoopKernel(Record):
 
                 parsed_instructions.append(
                         Instruction(
-                            id=self.make_unique_instruction_id(parsed_instructions, based_on=label),
+                            id=self.make_unique_instruction_id(
+                                parsed_instructions, based_on=insn_id),
                             insn_deps=insn_deps,
                             forced_iname_deps=forced_iname_deps,
                             assignee=lhs, expression=rhs,
                             temp_var_type=temp_var_type,
-                            duplicate_inames_and_tags=duplicate_inames_and_tags))
+                            duplicate_inames_and_tags=duplicate_inames_and_tags,
+                            priority=priority))
 
             elif subst_match is not None:
                 from pymbolic.primitives import Variable, Call
@@ -1378,8 +1405,8 @@ class LoopKernel(Record):
             all_inames_by_insns |= self.insn_inames(insn)
 
         if not all_inames_by_insns <= self.all_inames():
-            raise RuntimeError("inames collected from instructions (%s) "
-                    "that are not present in domain (%s)"
+            raise RuntimeError("some inames collected from instructions (%s) "
+                    "are not present in domain (%s)"
                     % (", ".join(sorted(all_inames_by_insns)),
                         ", ".join(sorted(self.all_inames()))))
 
@@ -1552,14 +1579,20 @@ class LoopKernel(Record):
         loop_list_width = 35
         for insn in self.instructions:
             loop_list = ",".join(sorted(self.insn_inames(insn)))
+
+            options = [insn.id]
+            if insn.priority:
+                options.append("priority=%d" % insn.priority)
+
             if len(loop_list) > loop_list_width:
                 lines.append("[%s]" % loop_list)
                 lines.append("%s%s <- %s   # %s" % (
-                    (loop_list_width+2)*" ", insn.assignee, insn.expression, insn.id))
+                    (loop_list_width+2)*" ", insn.assignee,
+                    insn.expression, ", ".join(options)))
             else:
                 lines.append("[%s]%s%s <- %s   # %s" % (
                     loop_list, " "*(loop_list_width-len(loop_list)),
-                    insn.assignee, insn.expression, insn.id))
+                    insn.assignee, insn.expression, ", ".join(options)))
 
         lines.append(sep)
         lines.append("DEPENDENCIES:")
diff --git a/loopy/schedule.py b/loopy/schedule.py
index a6bc240c4..62b0214ce 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -236,12 +236,18 @@ class ScheduleDebugger:
 
 # {{{ scheduling algorithm
 
-def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_boost=False, debug=None):
+def generate_loop_schedules_internal(kernel, loop_priority, schedule=[],
+        allow_boost=False, allow_insn=False, debug=None):
+    # allow_insn is set to False initially and after entering each loop
+    # to give loops containing high-priority instructions a chance.
+
     all_insn_ids = set(insn.id for insn in kernel.instructions)
 
     scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule
             if isinstance(sched_item, RunInstruction))
 
+    unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids
+
     if allow_boost is None:
         rec_allow_boost = None
     else:
@@ -298,21 +304,22 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
     # }}}
 
-    # {{{ see if any insn can be scheduled now
+    # {{{ see if any insns are ready to be scheduled now
 
     # Also take note of insns that have a chance of being schedulable inside
     # the current loop nest, in this set:
 
     reachable_insn_ids = set()
 
-    unscheduled_insn_ids = all_insn_ids - scheduled_insn_ids
+    for insn_id in sorted(unscheduled_insn_ids,
+            key=lambda insn_id: kernel.id_to_insn[insn_id].priority,
+            reverse=True):
 
-    for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
-        schedule_now = set(insn.insn_deps) <= scheduled_insn_ids
+        is_ready = set(insn.insn_deps) <= scheduled_insn_ids
 
-        if not schedule_now:
+        if not is_ready:
             if debug_mode:
                 print "instruction '%s' is missing insn depedencies '%s'" % (
                         insn.id, ",".join(set(insn.insn_deps) - scheduled_insn_ids))
@@ -330,7 +337,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
             have = have - insn.boostable_into
 
         if want != have:
-            schedule_now = False
+            is_ready = False
 
             if debug_mode:
                 if want-have:
@@ -342,12 +349,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
         # {{{ determine reachability
 
-        if (not schedule_now and have <= want):
+        if (not is_ready and have <= want):
             reachable_insn_ids.add(insn_id)
 
         # }}}
 
-        if schedule_now:
+        if is_ready and allow_insn:
             if debug_mode:
                 print "scheduling '%s'" % insn.id
             scheduled_insn_ids.add(insn.id)
@@ -359,13 +366,12 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
             for sub_sched in generate_loop_schedules_internal(
                     kernel, loop_priority, schedule,
-                    allow_boost=rec_allow_boost, debug=debug):
+                    allow_boost=rec_allow_boost, debug=debug,
+                    allow_insn=True):
                 yield sub_sched
 
             return
 
-    unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids)
-
     # }}}
 
     # {{{ see if we're ready to leave the innermost loop
@@ -413,7 +419,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
                 for sub_sched in generate_loop_schedules_internal(
                         kernel, loop_priority, schedule,
-                        allow_boost=rec_allow_boost, debug=debug):
+                        allow_boost=rec_allow_boost, debug=debug,
+                        allow_insn=allow_insn):
                     yield sub_sched
 
                 return
@@ -443,7 +450,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         print 75*"-"
 
     if needed_inames:
-        useful_loops = []
+        iname_to_usefulness = {}
 
         for iname in needed_inames:
 
@@ -483,7 +490,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
             # {{{ determine if that gets us closer to being able to schedule an insn
 
-            useful = False
+            usefulness = None # highest insn priority enabled by iname
 
             hypothetically_active_loops = active_inames_set | set([iname])
             for insn_id in reachable_insn_ids:
@@ -492,15 +499,17 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                 want = kernel.insn_inames(insn) | insn.boostable_into
 
                 if hypothetically_active_loops <= want:
-                    useful = True
-                    break
+                    if usefulness is None:
+                        usefulness = insn.priority
+                    else:
+                        usefulness = max(usefulness, insn.priority)
 
-            if not useful:
+            if usefulness is None:
                 if debug_mode:
                     print "iname '%s' deemed not useful" % iname
                 continue
 
-            useful_loops.append(iname)
+            iname_to_usefulness[iname] = usefulness
 
             # }}}
 
@@ -511,7 +520,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
 
         loop_priority_set = set(loop_priority)
         lowest_priority_set = set(kernel.lowest_priority_inames)
-        useful_loops_set = set(useful_loops)
+        useful_loops_set = set(iname_to_usefulness.iterkeys())
         useful_and_desired = useful_loops_set & loop_priority_set
 
         if useful_and_desired:
@@ -521,27 +530,29 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                     and iname not in kernel.lowest_priority_inames]
 
             priority_tiers.append(
-                    set(useful_loops)
+                    useful_loops_set
                     - loop_priority_set
                     - lowest_priority_set)
         else:
-            priority_tiers = [set(useful_loops) - lowest_priority_set]
+            priority_tiers = [useful_loops_set - lowest_priority_set]
 
         priority_tiers.extend([
             [iname]
             for iname in kernel.lowest_priority_inames
-            if iname in useful_loops
+            if iname in useful_loops_set
             ])
 
         # }}}
 
         if debug_mode:
-            print "useful inames: %s" % ",".join(useful_loops)
+            print "useful inames: %s" % ",".join(useful_loops_set)
 
         for tier in priority_tiers:
             found_viable_schedule = False
 
-            for iname in tier:
+            for iname in sorted(tier,
+                    key=lambda iname: iname_to_usefulness.get(iname, 0),
+                    reverse=True):
                 new_schedule = schedule + [EnterLoop(iname=iname)]
 
                 for sub_sched in generate_loop_schedules_internal(
@@ -567,11 +578,20 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
         yield schedule
 
     else:
+        if not allow_insn:
+            # try again with instruction scheduling allowed
+            for sub_sched in generate_loop_schedules_internal(
+                    kernel, loop_priority, schedule=schedule,
+                    allow_boost=allow_boost, debug=debug,
+                    allow_insn=True):
+                yield sub_sched
+
         if not allow_boost and allow_boost is not None:
             # try again with boosting allowed
             for sub_sched in generate_loop_schedules_internal(
                     kernel, loop_priority, schedule=schedule,
-                    allow_boost=True, debug=debug):
+                    allow_boost=True, debug=debug,
+                    allow_insn=allow_insn):
                 yield sub_sched
         else:
             # dead end
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 74bd1c501..6cad0b080 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -240,7 +240,7 @@ def test_variable_size_matrix_mul(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "[n] -> {[i,j,k]: 0<=i,j,k<n}",
             [
-                "label: c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j]) {id=labl}"
                 ],
             [
                 lp.GlobalArg("a", dtype, shape=(n, n), order=order),
@@ -291,7 +291,7 @@ def test_rank_one(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "[n] -> {[i,j]: 0<=i,j<n}",
             [
-                "label: c[i, j] = a[i]*b[j]"
+                "c[i, j] = a[i]*b[j] {id=mylabel, priority =5}"
                 ],
             [
                 lp.GlobalArg("a", dtype, shape=(n,), order=order),
@@ -478,62 +478,48 @@ def test_intel_matrix_mul(ctx_factory):
 
 
 def test_magma_fermi_matrix_mul(ctx_factory):
-    1/0 # not updated to new conventions
-
     dtype = np.float32
     ctx = ctx_factory()
     order = "C"
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    n = 6*16*16
+    n = get_suitable_size(ctx)
 
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
+                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                 ],
             [
-                lp.ImageArg("a", dtype, 2),
-                lp.ImageArg("b", dtype, 2),
+                lp.ImageArg("a", dtype, shape=(n, n)),
+                lp.ImageArg("b", dtype, shape=(n, n)),
                 lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                 ],
             name="matmul")
 
+    seq_knl = knl
+
     i_reg = 4
     j_reg = 4
     i_chunks = 16
     j_chunks = 16
+
+
     knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0")
     knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
     knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1")
     knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
     knl = lp.split_dimension(knl, "k", 16)
-    #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr")
-    knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")])
-    knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
+    knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr")
+    # FIXME
+    #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])
+    #knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    #hints=["k_outer", "k_inner_outer", "k_inner_inner"]
     kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
-    a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    a_img = cl.image_from_array(ctx, a.get(), 1)
-    b_img = cl.image_from_array(ctx, b.get(), 1)
-    c = cl_array.empty_like(a)
-    refsol = np.dot(a.get(), b.get())
-
-    def launcher(kernel, gsize, lsize, check):
-        evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
-                g_times_l=True)
-
-        if check:
-            check_error(refsol, c.get())
-
-        return evt
-
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
+            op_count=[2*n**3/1e9], op_label=["GFlops"],
+            parameters={})
 
 
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 0639bbfec..ffad62f6a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -41,7 +41,7 @@ def test_wg_too_small(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i]: 0<=i<100}",
             [
-                "[i:l.0] <float32> z[i] = a[i]"
+                "[i:l.0] <float32> z[i] = a[i] {id=copy}"
                 ],
             [lp.GlobalArg("a", np.float32, shape=(100,))],
             local_sizes={0: 16})
diff --git a/proto-tests/test_sem_reagan.py b/test/test_sem_reagan.py
similarity index 100%
rename from proto-tests/test_sem_reagan.py
rename to test/test_sem_reagan.py
-- 
GitLab