From ca93d84e42f1b271a7a596c860b16614a26481f2 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 29 Oct 2011 20:10:42 -0400
Subject: [PATCH] A few fixes. Some code shifting. Loosen up owed_barriers
 checking.

---
 loopy/__init__.py   | 270 ++++++++++++++++++++++++++++++------------
 loopy/check.py      |  80 +++++++++++++
 loopy/compiled.py   |   1 -
 loopy/kernel.py     | 277 ++++++++------------------------------------
 loopy/schedule.py   |  11 +-
 loopy/symbolic.py   |   6 +-
 test/test_linalg.py |  43 ++++---
 test/test_loopy.py  |  47 ++++++++
 8 files changed, 403 insertions(+), 332 deletions(-)
 create mode 100644 test/test_loopy.py

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 2b8dba210..aab4f1083 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,15 +22,209 @@ class LoopyAdvisory(UserWarning):
 
 from loopy.kernel import ScalarArg, ArrayArg, ImageArg
 
-from loopy.kernel import make_kernel, AutoFitLocalIndexTag
+from loopy.kernel import AutoFitLocalIndexTag
 from loopy.preprocess import preprocess_kernel
 from loopy.schedule import generate_loop_schedules
 from loopy.compiled import CompiledKernel, drive_timing_run
+from loopy.check import check_kernels
+
+# }}}
+
+# {{{ kernel creation
+
+def make_kernel(*args, **kwargs):
+    """Second pass of kernel creation. Think about requests for iname duplication
+    and temporary variable declaration received as part of string instructions.
+    """
+
+    from loopy.kernel import LoopKernel
+    knl = LoopKernel(*args, **kwargs)
+
+    knl = tag_dimensions(
+            knl.copy(iname_to_tag_requests=None),
+            knl.iname_to_tag_requests)
+
+    new_insns = []
+    new_domain = knl.domain
+    new_temp_vars = knl.temporary_variables.copy()
+    new_iname_to_tag = knl.iname_to_tag.copy()
+
+    newly_created_vars = set()
+
+    # {{{ reduction iname duplication helper function
+
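+    # Note: this helper closes over per-instruction state (insn,
+    # insn_dup_iname_to_tag, old_insn_inames, new_insn_inames) that the
+    # instruction loop below rebinds before the mapper invokes it.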
+    def duplicate_reduction_inames(reduction_expr, rec):
+        duplicate_inames = [iname
+                for iname, tag in insn.duplicate_inames_and_tags]
+
+        child = rec(reduction_expr.expr)
+        new_red_inames = []
+        did_something = False
+
+        for iname in reduction_expr.inames:
+            if iname in duplicate_inames:
+                new_iname = knl.make_unique_var_name(iname, newly_created_vars)
+
+                old_insn_inames.append(iname)
+                new_insn_inames.append(new_iname)
+                newly_created_vars.add(new_iname)
+                new_red_inames.append(new_iname)
+                did_something = True
+            else:
+                new_red_inames.append(iname)
+
+        if did_something:
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+            from pymbolic import var
+            subst_dict = dict(
+                    (old_iname, var(new_iname))
+                    for old_iname, new_iname in zip(
+                        reduction_expr.inames, new_red_inames))
+            subst_map = SubstitutionMapper(make_subst_func(subst_dict))
+
+            child = subst_map(child)
+
+            for old_iname, new_iname in zip(reduction_expr.inames, new_red_inames):
+                new_iname_to_tag[new_iname] = insn_dup_iname_to_tag[old_iname]
+
+        from loopy.symbolic import Reduction
+        return Reduction(
+                operation=reduction_expr.operation,
+                inames=tuple(new_red_inames),
+                expr=child)
+
+    # }}}
+
+    for insn in knl.instructions:
+        # {{{ iname duplication
+
+        if insn.duplicate_inames_and_tags:
+
+            insn_dup_iname_to_tag = dict(insn.duplicate_inames_and_tags)
+
+            # {{{ duplicate non-reduction inames
+
+            reduction_inames = insn.reduction_inames()
+
+            duplicate_inames = [iname
+                    for iname, tag in insn.duplicate_inames_and_tags
+                    if iname not in reduction_inames]
+
+            new_inames = [
+                    knl.make_unique_var_name(
+                        iname,
+                        extra_used_vars=
+                        newly_created_vars)
+                    for iname in duplicate_inames]
+
+            for old_iname, new_iname in zip(duplicate_inames, new_inames):
+                new_tag = insn_dup_iname_to_tag[old_iname]
+                if new_tag is None:
+                    new_tag = AutoFitLocalIndexTag()
+                new_iname_to_tag[new_iname] = new_tag
+
+            newly_created_vars.update(new_inames)
+
+            from loopy.isl_helpers import duplicate_axes
+            new_domain = duplicate_axes(new_domain, duplicate_inames, new_inames)
+
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+            from pymbolic import var
+            old_to_new = dict(
+                    (old_iname, var(new_iname))
+                    for old_iname, new_iname in zip(duplicate_inames, new_inames))
+            subst_map = SubstitutionMapper(make_subst_func(old_to_new))
+            new_expression = subst_map(insn.expression)
+
+            # }}}
+
+            # {{{ duplicate reduction inames
+
+            if len(duplicate_inames) < len(insn.duplicate_inames_and_tags):
+                # there must've been requests to duplicate reduction inames
+                old_insn_inames = []
+                new_insn_inames = []
+
+                from loopy.symbolic import ReductionCallbackMapper
+                new_expression = (
+                        ReductionCallbackMapper(duplicate_reduction_inames)
+                        (new_expression))
+
+                from loopy.isl_helpers import duplicate_axes
+                for old, new in zip(old_insn_inames, new_insn_inames):
+                    new_domain = duplicate_axes(new_domain, [old], [new])
+
+            # }}}
+
+            insn = insn.copy(
+                    assignee=subst_map(insn.assignee),
+                    expression=new_expression,
+                    # map duplicated inames to their new names (plain strings,
+                    # not pymbolic variables)
+                    forced_iname_deps=[
+                        old_to_new[iname].name if iname in old_to_new else iname
+                        for iname in insn.forced_iname_deps],
+                    )
+
+        # }}}
+
+        # {{{ temporary variable creation
+
+        from loopy.kernel import (
+                find_var_base_indices_and_shape_from_inames,
+                TemporaryVariable)
+
+        if insn.temp_var_type is not None:
+            assignee_name = insn.get_assignee_var_name()
+
+            assignee_indices = []
+            from pymbolic.primitives import Variable
+            for index_expr in insn.get_assignee_indices():
+                if (not isinstance(index_expr, Variable)
+                        or index_expr.name not in insn.all_inames()):
+                    raise RuntimeError(
+                            "only plain inames are allowed in "
+                            "the lvalue index when declaring the "
+                            "variable '%s' in an instruction"
+                            % assignee_name)
+
+                assignee_indices.append(index_expr.name)
+
+            from loopy.kernel import LocalIndexTagBase
+            from pytools import any
+            is_local = any(
+                    isinstance(new_iname_to_tag.get(iname), LocalIndexTagBase)
+                    for iname in assignee_indices)
+
+            base_indices, shape = \
+                    find_var_base_indices_and_shape_from_inames(
+                            new_domain, assignee_indices)
+
+            new_temp_vars[assignee_name] = TemporaryVariable(
+                    name=assignee_name,
+                    dtype=np.dtype(insn.temp_var_type),
+                    is_local=is_local,
+                    base_indices=base_indices,
+                    shape=shape)
+
+            newly_created_vars.add(assignee_name)
+
+            insn = insn.copy(temp_var_type=None)
+
+        # }}}
+
+        new_insns.append(insn)
+
+    return knl.copy(
+            instructions=new_insns,
+            domain=new_domain,
+            temporary_variables=new_temp_vars,
+            iname_to_tag=new_iname_to_tag)
 
 # }}}
 
 # {{{ user-facing kernel manipulation functionality
 
+
 def split_dimension(kernel, iname, inner_length, padded_length=None,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
@@ -407,80 +601,6 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
 
 
-def get_problems(kernel, parameters):
-    """
-    :return: *(max_severity, list of (severity, msg))*, where *severity* ranges from 1-5.
-        '5' means 'will certainly not run'.
-    """
-    msgs = []
-
-    def msg(severity, s):
-        msgs.append((severity, s))
-
-    glens, llens = kernel.get_grid_sizes_as_exprs()
-
-    from pymbolic import evaluate
-    from pymbolic.mapper.evaluator import UnknownVariableError
-    try:
-        glens = evaluate(glens, parameters)
-        llens = evaluate(llens, parameters)
-    except UnknownVariableError, name:
-        raise RuntimeError("When checking your kernel for problems, "
-                "a value for parameter '%s' was not available. Pass "
-                "it in the 'parameters' kwarg to check_kernels()."
-                % name)
-
-    if (max(len(glens), len(llens))
-            > kernel.device.max_work_item_dimensions):
-        msg(5, "too many work item dimensions")
-
-    for i in range(len(llens)):
-        if llens[i] > kernel.device.max_work_item_sizes[i]:
-            msg(5, "group axis %d too big" % i)
-
-    from pytools import product
-    if product(llens) > kernel.device.max_work_group_size:
-        msg(5, "work group too big")
-
-    import pyopencl as cl
-    from pyopencl.characterize import usable_local_mem_size
-    if kernel.local_mem_use() > usable_local_mem_size(kernel.device):
-        if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL:
-            msg(5, "using too much local memory")
-        else:
-            msg(4, "using more local memory than available--"
-                    "possibly OK due to cache nature")
-
-    const_arg_count = sum(
-            1 for arg in kernel.args
-            if isinstance(arg, ArrayArg) and arg.constant_mem)
-
-    if const_arg_count > kernel.device.max_constant_args:
-        msg(5, "too many constant arguments")
-
-    max_severity = 0
-    for sev, msg in msgs:
-        max_severity = max(sev, max_severity)
-    return max_severity, msgs
-
-
-
-
-def check_kernels(kernel_gen, parameters, kill_level_min=3,
-        warn_level_min=1):
-    for kernel in kernel_gen:
-        max_severity, msgs = get_problems(kernel, parameters)
-
-        for severity, msg in msgs:
-            if severity >= warn_level_min:
-                from warnings import warn
-                from loopy import LoopyAdvisory
-                warn(msg, LoopyAdvisory)
-
-        if max_severity < kill_level_min:
-            yield kernel
-
-# }}}
 
 # {{{ convenience
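The second pass of make_kernel moved above is what acts on the per-instruction
iname-tag and temporary-variable requests. A usage sketch, closely mirroring the
new test in test/test_loopy.py (the context setup, the "l.0" tag and the array
shape are illustrative):

    import numpy as np
    import pyopencl as cl
    import loopy as lp

    ctx = cl.create_some_context()

    # "[i:l.0]" requests the l.0 tag for the iname i; "<float32>" asks the
    # second pass of make_kernel to create the temporary variable z, sized
    # from the lvalue index i.
    knl = lp.make_kernel(ctx.devices[0],
            "{[i]: 0<=i<100}",
            ["[i:l.0] <float32> z[i] = a[i]"],
            [lp.ArrayArg("a", np.float32, shape=(100,))])

    kernel_gen = lp.check_kernels(lp.generate_loop_schedules(knl))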
 
diff --git a/loopy/check.py b/loopy/check.py
index 996ad9f48..f7932a098 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -212,3 +212,83 @@ def check_implemented_domains(kernel, implemented_domains):
     return True
 
 # }}}
+
+# {{{ user-invoked checks
+
+def get_problems(kernel, parameters):
+    """
+    :return: *(max_severity, list of (severity, msg))*, where *severity* ranges
+        from 1 to 5, and 5 means 'will certainly not run'.
+    """
+    msgs = []
+
+    def msg(severity, s):
+        msgs.append((severity, s))
+
+    glens, llens = kernel.get_grid_sizes_as_exprs()
+
+    from pymbolic import evaluate
+    from pymbolic.mapper.evaluator import UnknownVariableError
+    try:
+        glens = evaluate(glens, parameters)
+        llens = evaluate(llens, parameters)
+    except UnknownVariableError, name:
+        raise RuntimeError("When checking your kernel for problems, "
+                "a value for parameter '%s' was not available. Pass "
+                "it in the 'parameters' kwarg to check_kernels()."
+                % name)
+
+    if (max(len(glens), len(llens))
+            > kernel.device.max_work_item_dimensions):
+        msg(5, "too many work item dimensions")
+
+    for i in range(len(llens)):
+        if llens[i] > kernel.device.max_work_item_sizes[i]:
+            msg(5, "group axis %d too big" % i)
+
+    from pytools import product
+    if product(llens) > kernel.device.max_work_group_size:
+        msg(5, "work group too big")
+
+    import pyopencl as cl
+    from pyopencl.characterize import usable_local_mem_size
+    if kernel.local_mem_use() > usable_local_mem_size(kernel.device):
+        if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL:
+            msg(5, "using too much local memory")
+        else:
+            msg(4, "using more local memory than available--"
+                    "possibly OK due to cache nature")
+
+    from loopy.kernel import ArrayArg
+    const_arg_count = sum(
+            1 for arg in kernel.args
+            if isinstance(arg, ArrayArg) and arg.constant_mem)
+
+    if const_arg_count > kernel.device.max_constant_args:
+        msg(5, "too many constant arguments")
+
+    max_severity = 0
+    for sev, msg in msgs:
+        max_severity = max(sev, max_severity)
+    return max_severity, msgs
+
+
+
+
+def check_kernels(kernel_gen, parameters={}, kill_level_min=5,
+        warn_level_min=1):
+    for kernel in kernel_gen:
+        max_severity, msgs = get_problems(kernel, parameters)
+
+        for severity, msg in msgs:
+            if severity >= warn_level_min:
+                from warnings import warn
+                from loopy import LoopyAdvisory
+                warn(msg, LoopyAdvisory)
+
+        if max_severity < kill_level_min:
+            yield kernel
+
+# }}}
+
+# vim: foldmethod=marker
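With the new defaults (parameters={} and kill_level_min=5), callers only need to
pass the parameter values that the grid-size expressions reference; a minimal
sketch of the updated call pattern (ctx, knl and n are placeholders):

    import loopy as lp

    kernel_gen = lp.generate_loop_schedules(knl)
    # severity-5 problems ("will certainly not run") still drop a kernel;
    # anything at or above warn_level_min is reported as a LoopyAdvisory warning
    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))

    for scheduled_knl in kernel_gen:
        print lp.CompiledKernel(ctx, scheduled_knl).code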
diff --git a/loopy/compiled.py b/loopy/compiled.py
index a263ea8f2..0c6ec7580 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -48,7 +48,6 @@ class CompiledKernel:
         else:
             self.size_args = size_args
 
-        from loopy.kernel import GroupIndexTag, LocalIndexTag
         gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs()
 
         if not gsize_expr: gsize_expr = (1,)
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 036b1257c..bc4588b87 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -305,14 +305,15 @@ class Instruction(Record):
         from pymbolic.primitives import Variable, Subscript
 
         if isinstance(self.assignee, Variable):
-            result = ()
+            return ()
         elif isinstance(self.assignee, Subscript):
             result = self.assignee.index
+            if not isinstance(result, tuple):
+                result = (result,)
+            return result
         else:
             raise RuntimeError("invalid lvalue '%s'" % self.assignee)
 
-        return result
-
     @memoize_method
     def get_read_var_names(self):
         from loopy.symbolic import DependencyMapper
@@ -437,6 +438,10 @@ class LoopKernel(Record):
         'bulk' slabs with fewer conditionals.
     :ivar temporary_variables:
     :ivar iname_to_tag:
+
+    The following instance variable is only used until :func:`loopy.make_kernel` is
+    finished:
+    :ivar iname_to_tag_requests:
     """
 
     def __init__(self, device, domain, instructions, args=None, schedule=None,
@@ -445,14 +450,12 @@ class LoopKernel(Record):
             iname_slab_increments={},
             temporary_variables={},
             workgroup_size=None,
-            iname_to_dim=None,
-            iname_to_tag={},
-            ):
+            iname_to_tag={}, iname_to_tag_requests=None):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
         """
-        assert iname_to_dim is None
+        assert not iname_to_tag_requests
 
         import re
 
@@ -460,12 +463,14 @@ class LoopKernel(Record):
             ctx = isl.Context()
             domain = isl.Set.read_from_str(ctx, domain)
 
-        DUP_ENTRY_RE = re.compile(
+        iname_to_tag_requests = {}
+
+        INAME_ENTRY_RE = re.compile(
                 r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$")
         LABEL_DEP_RE = re.compile(
                 r"^\s*(?:(?P<label>\w+):)?"
                 "\s*(?:\["
-                    "(?P<iname_deps>[\s\w,]*)"
+                    "(?P<iname_deps_and_tags>[\s\w,:.]*)"
                     "(?:\|(?P<duplicate_inames_and_tags>[\s\w,:.]*))?"
                 "\])?"
                 "\s*(?:\<(?P<temp_var_type>.+)\>)?"
@@ -473,10 +478,35 @@ class LoopKernel(Record):
                 "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$"
                 )
 
+        def parse_iname_and_tag_list(s):
+            dup_entries = [
+                    dep.strip() for dep in s.split(",")]
+            result = []
+            for entry in dup_entries:
+                if not entry:
+                    continue
+
+                entry_match = INAME_ENTRY_RE.match(entry)
+                if entry_match is None:
+                    raise RuntimeError(
+                            "could not parse iname:tag entry '%s'"
+                            % entry)
+
+                groups = entry_match.groupdict()
+                iname = groups["iname"]
+                assert iname
+
+                tag = None
+                if groups["tag"] is not None:
+                    tag = parse_tag(groups["tag"])
+
+                result.append((iname, tag))
+
+            return result
+
         def parse_if_necessary(insn):
             from pymbolic import parse
 
-
             if isinstance(insn, Instruction):
                 return insn
             if isinstance(insn, str):
@@ -494,35 +524,17 @@ class LoopKernel(Record):
                 else:
                     insn_deps = []
 
-                if groups["iname_deps"] is not None:
-                    forced_iname_deps = [dep.strip()
-                            for dep in groups["iname_deps"].split(",")
-                            if dep.strip()]
+                if groups["iname_deps_and_tags"] is not None:
+                    inames_and_tags = parse_iname_and_tag_list(
+                            groups["iname_deps_and_tags"])
+                    forced_iname_deps = [iname for iname, tag in inames_and_tags]
+                    iname_to_tag_requests.update(dict(inames_and_tags))
                 else:
                     forced_iname_deps = []
 
                 if groups["duplicate_inames_and_tags"] is not None:
-                    dup_entries = [
-                            dep.strip() for dep in groups["duplicate_inames_and_tags"].split(",")]
-                    duplicate_inames_and_tags = []
-                    for dup_entry in dup_entries:
-                        if not dup_entry:
-                            continue
-
-                        dup_entry_match = DUP_ENTRY_RE.match(dup_entry)
-                        if dup_entry_match is None:
-                            raise RuntimeError(
-                                    "could not parse iname duplication entry '%s'"
-                                    % dup_entry)
-
-                        dup_groups = dup_entry_match.groupdict()
-                        dup_iname = dup_groups["iname"]
-                        assert dup_iname
-                        dup_tag = None
-                        if dup_groups["tag"] is not None:
-                            dup_tag = parse_tag(dup_groups["tag"])
-
-                        duplicate_inames_and_tags.append((dup_iname, dup_tag))
+                    duplicate_inames_and_tags = parse_iname_and_tag_list(
+                            groups["duplicate_inames_and_tags"])
                 else:
                     duplicate_inames_and_tags = []
 
@@ -574,7 +586,8 @@ class LoopKernel(Record):
                 iname_slab_increments=iname_slab_increments,
                 temporary_variables=temporary_variables,
                 workgroup_size=workgroup_size,
-                iname_to_tag=iname_to_tag)
+                iname_to_tag=iname_to_tag,
+                iname_to_tag_requests=iname_to_tag_requests)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
         if insns is None:
@@ -830,196 +843,4 @@ def find_var_base_indices_and_shape_from_inames(domain, inames):
 
 
 
-# {{{ count number of uses of each reduction iname
-
-# }}}
-
-
-
-# {{{ pass 2 of kernel creation
-
-def make_kernel(*args, **kwargs):
-    """Second pass of kernel creation. Think about requests for iname duplication
-    and temporary variable declaration received as part of string instructions.
-    """
-
-    knl = LoopKernel(*args, **kwargs)
-
-    new_insns = []
-    new_domain = knl.domain
-    new_temp_vars = knl.temporary_variables.copy()
-    new_iname_to_tag = knl.iname_to_tag.copy()
-
-    newly_created_vars = set()
-
-    # {{{ reduction iname duplication helper function
-
-    def duplicate_reduction_inames(reduction_expr, rec):
-        duplicate_inames = [iname
-                for iname, tag in insn.duplicate_inames_and_tags]
-
-        child = rec(reduction_expr.expr)
-        new_red_inames = []
-        did_something = False
-
-        for iname in reduction_expr.inames:
-            if iname in duplicate_inames:
-                new_iname = knl.make_unique_var_name(iname, newly_created_vars)
-
-                old_insn_inames.append(iname)
-                new_insn_inames.append(new_iname)
-                newly_created_vars.add(new_iname)
-                new_red_inames.append(new_iname)
-                reduction_iname_uses[iname] -= 1
-                did_something = True
-            else:
-                new_red_inames.append(iname)
-
-        if did_something:
-            from loopy.symbolic import SubstitutionMapper
-            from pymbolic.mapper.substitutor import make_subst_func
-            from pymbolic import var
-            subst_dict = dict(
-                    (old_iname, var(new_iname))
-                    for old_iname, new_iname in zip(
-                        reduction_expr.inames, new_red_inames))
-            subst_map = SubstitutionMapper(make_subst_func(subst_dict))
-
-            child = subst_map(child)
-
-            for old_iname, new_iname in zip(reduction_expr.inames, new_red_inames):
-                new_iname_to_tag[new_iname] = insn_dup_iname_to_tag[old_iname]
-
-        from loopy.symbolic import Reduction
-        return Reduction(
-                operation=reduction_expr.operation,
-                inames=tuple(new_red_inames),
-                expr=child)
-
-    # }}}
-
-    for insn in knl.instructions:
-        # {{{ iname duplication
-
-        if insn.duplicate_inames_and_tags:
-
-            insn_dup_iname_to_tag = dict(insn.duplicate_inames_and_tags)
-
-            # {{{ duplicate non-reduction inames
-
-            reduction_inames = insn.reduction_inames()
-
-            duplicate_inames = [iname
-                    for iname, tag in insn.duplicate_inames_and_tags
-                    if iname not in reduction_inames]
-
-            new_inames = [
-                    knl.make_unique_var_name(
-                        iname,
-                        extra_used_vars=
-                        newly_created_vars)
-                    for iname in duplicate_inames]
-
-            for old_iname, new_iname in zip(duplicate_inames, new_inames):
-                new_tag = insn_dup_iname_to_tag[old_iname]
-                if new_tag is None:
-                    new_tag = AutoFitLocalIndexTag()
-                new_iname_to_tag[new_iname] = new_tag
-
-            newly_created_vars.update(new_inames)
-
-            from loopy.isl_helpers import duplicate_axes
-            new_domain = duplicate_axes(new_domain, duplicate_inames, new_inames)
-
-            from loopy.symbolic import SubstitutionMapper
-            from pymbolic.mapper.substitutor import make_subst_func
-            old_to_new = dict(
-                    (old_iname, var(new_iname))
-                    for old_iname, new_iname in zip(duplicate_inames, new_inames))
-            subst_map = SubstitutionMapper(make_subst_func(old_to_new))
-            new_expression = subst_map(insn.expression)
-
-            # }}}
-
-            # {{{ duplicate reduction inames
-
-            if len(duplicate_inames) < len(insn.duplicate_inames_and_tags):
-                # there must've been requests to duplicate reduction inames
-                old_insn_inames = []
-                new_insn_inames = []
-
-                from loopy.symbolic import ReductionCallbackMapper
-                new_expression = (
-                        ReductionCallbackMapper(duplicate_reduction_inames)
-                        (new_expression))
-
-                from loopy.isl_helpers import duplicate_axes
-                for old, new in zip(old_insn_inames, new_insn_inames):
-                    new_domain = duplicate_axes(new_domain, [old], [new])
-
-            # }}}
-
-            insn = insn.copy(
-                    assignee=subst_map(insn.assignee),
-                    expression=new_expression,
-                    forced_iname_deps=[
-                        old_to_new.get(iname, iname) for iname in insn.forced_iname_deps],
-                    )
-
-        # }}}
-
-        # {{{ temporary variable creation
-
-        if insn.temp_var_type is not None:
-            assignee_name = insn.get_assignee_var_name()
-
-            assignee_indices = []
-            from pymbolic.primitives import Variable
-            for index_expr in insn.get_assignee_indices():
-                if (not isinstance(index_expr, Variable)
-                        or not index_expr.name in insn.all_inames()):
-                    raise RuntimeError(
-                            "only plain inames are allowed in "
-                            "the lvalue index when declaring the "
-                            "variable '%s' in an instruction"
-                            % assignee_name)
-
-                assignee_indices.append(index_expr.name)
-
-            from loopy.kernel import LocalIndexTagBase
-            from pytools import any
-            is_local = any(
-                    isinstance(new_iname_to_tag.get(iname), LocalIndexTagBase)
-                    for iname in assignee_indices)
-
-            base_indices, shape = \
-                    find_var_base_indices_and_shape_from_inames(
-                            new_domain, assignee_indices)
-
-            new_temp_vars[assignee_name] = TemporaryVariable(
-                    name=assignee_name,
-                    dtype=np.dtype(insn.temp_var_type),
-                    is_local=is_local,
-                    base_indices=base_indices,
-                    shape=shape)
-
-            newly_created_vars.add(assignee_name)
-
-            insn = insn.copy(temp_var_type=None)
-
-        # }}}
-
-        new_insns.append(insn)
-
-    return knl.copy(
-            instructions=new_insns,
-            domain=new_domain,
-            temporary_variables=new_temp_vars,
-            iname_to_tag=new_iname_to_tag)
-
-# }}}
-
-
-
-
 # vim: foldmethod=marker
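The widened LABEL_DEP_RE/INAME_ENTRY_RE grammar accepts iname:tag entries both
in the forced-iname bracket and in the duplication list after "|". Two
illustrative instruction strings (the names and tags are made up for the
example; the second one follows the reduction syntax used in test_linalg.py):

    # depend on i, request the l.0 tag for it, and declare z as a float32
    # temporary whose shape is derived from the lvalue index
    "[i:l.0] <float32> z[i] = a[i]"

    # label the instruction, force dependence on i and j, and ask the second
    # pass of make_kernel to duplicate the reduction iname k
    "store: [i, j|k] c[i, j] = sum_float32(k, a[i, k]*b[k, j])"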
diff --git a/loopy/schedule.py b/loopy/schedule.py
index d61295676..9b597e844 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -410,9 +410,6 @@ def insert_barriers(kernel, schedule, level=0):
             if assignee_temp_var is not None and assignee_temp_var.is_local:
                 dep = get_barrier_dependent_in_schedule(kernel, insn.id, schedule)
 
-                if level == 0:
-                    assert dep
-
                 if dep:
                     issue_barrier(is_pre_barrier=True, dep=dep)
 
@@ -438,7 +435,13 @@ def generate_loop_schedules(kernel, loop_priority=[]):
 
     for gen_sched in generate_loop_schedules_internal(kernel, loop_priority):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
-        assert not owed_barriers
+        if owed_barriers:
+            from warnings import warn
+            from loopy import LoopyAdvisory
+            warn("Barrier insertion finished without inserting barriers for "
+                    "local memory writes in these instructions: '%s'. "
+                    "This often means that local memory was "
+                    "written, but never read." % ",".join(owed_barriers), LoopyAdvisory)
 
         yield kernel.copy(schedule=gen_sched)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 5e08fbd44..e2f48eed9 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -338,9 +338,13 @@ class LoopyCCodeMapper(CCodeMapper):
 
         elif expr.aggregate.name in self.kernel.temporary_variables:
             temp_var = self.kernel.temporary_variables[expr.aggregate.name]
+            if isinstance(expr.index, tuple):
+                index = expr.index
+            else:
+                index = (expr.index,)
 
             return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE)
-                for idx in expr.index))
+                for idx in index))
 
         else:
             raise RuntimeError("nothing known about variable '%s'" % expr.aggregate.name)
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 83b13f74f..265353abe 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -133,7 +133,7 @@ def test_axpy(ctx_factory):
     for variant in [variant_cpu, variant_gpu]:
         kernel_gen = lp.generate_loop_schedules(variant(knl),
                 loop_priority=["i_inner_outer"])
-        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
         def launcher(kernel, gsize, lsize, check):
             evt = kernel(queue, gsize(n), lsize(n), 2, a.data, 3, b.data, c.data, n,
@@ -178,7 +178,7 @@ def test_transpose(ctx_factory):
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, {}, kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {})
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = cl_array.empty_like(a)
@@ -228,7 +228,7 @@ def test_plain_matrix_mul(ctx_factory):
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, {}, kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {})
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -282,7 +282,7 @@ def test_variable_size_matrix_mul(ctx_factory):
     knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -376,7 +376,7 @@ def test_rank_one(ctx_factory):
     for variant in [variant_1, variant_2, variant_4]:
 
         kernel_gen = lp.generate_loop_schedules(variant(knl))
-        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
         a = cl_random.rand(queue, n, dtype=dtype)
         b = cl_random.rand(queue, n, dtype=dtype)
@@ -430,7 +430,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -492,7 +492,7 @@ def test_intel_matrix_mul(ctx_factory):
 
     kernel_gen = lp.generate_loop_schedules(knl)
     #hints=["k_outer", "k_inner_outer", "k_inner_inner"]
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -526,7 +526,7 @@ def test_magma_fermi_matrix_mul(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j,k]: 0<=i,j,k<%d}" % n,
             [
-                "c[i, j] = a[i, k]*b[k, j]"
+                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
@@ -539,20 +539,18 @@ def test_magma_fermi_matrix_mul(ctx_factory):
     j_reg = 4
     i_chunks = 16
     j_chunks = 16
-    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True)
-    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True)
-    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True)
-    knl = lp.split_dimension(knl, "k", 16, no_slabs=True)
+    knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0")
+    knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1")
+    knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
+    knl = lp.split_dimension(knl, "k", 16)
     #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr")
     knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")])
     knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),])
-    assert knl.get_problems({})[0] <= 2
 
-    kernel_gen = (lp.insert_register_prefetches(knl)
-            for knl in lp.generate_loop_schedules(knl,
-                hints=["k_outer", "k_inner_outer", "k_inner_inner"]
-                ))
+    kernel_gen = lp.generate_loop_schedules(knl)
+    #hints=["k_outer", "k_inner_outer", "k_inner_inner"]
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -603,10 +601,9 @@ def test_image_matrix_mul(ctx_factory):
     # conflict-free
     knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"])
-    assert knl.get_problems({})[0] <= 2
 
-    kernel_gen = (lp.insert_register_prefetches(knl)
-            for knl in lp.generate_loop_schedules(knl))
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -662,7 +659,7 @@ def test_image_matrix_mul_ilp(ctx_factory):
     #knl = lp.add_prefetch(knl, 'b', [("j_inner_outer", "j_inner_inner"), "k_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
@@ -716,7 +713,7 @@ def test_fancy_matrix_mul(ctx_factory):
     knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
     a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, 
             ran_factor=0)
diff --git a/test/test_loopy.py b/test/test_loopy.py
new file mode 100644
index 000000000..28561d56f
--- /dev/null
+++ b/test/test_loopy.py
@@ -0,0 +1,47 @@
+from __future__ import division
+
+import numpy as np
+import pyopencl as cl
+import pyopencl.array as cl_array
+import pyopencl.clrandom as cl_random
+import loopy as lp
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+
+
+
+def test_owed_barriers(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    knl = lp.make_kernel(ctx.devices[0],
+            "{[i]: 0<=i<100}",
+            [
+                "[i:l.0] <float32> z[i] = a[i]"
+                ],
+            [
+                lp.ArrayArg("a", dtype, shape=(100,)),
+                ])
+
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen)
+
+    for gen_knl in kernel_gen:
+        compiled = lp.CompiledKernel(ctx, gen_knl)
+        print compiled.code
+
+
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
-- 
GitLab