From f29c680a85315f6dc641624c6111661c962efe58 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 2 Oct 2011 18:56:32 +0200
Subject: [PATCH] Automatic axis finding. Rename WORK_ITEM_IDX -> LOCAL_IDX.

---
 MEMO                      |  13 ++-
 loopy/__init__.py         |  12 +--
 loopy/codegen/__init__.py |   8 +-
 loopy/codegen/bounds.py   |   4 +-
 loopy/codegen/prefetch.py |  10 +-
 loopy/compiled.py         |   4 +-
 loopy/isl.py              |  19 +++-
 loopy/kernel.py           | 142 +++++++++++++++++-----------
 loopy/schedule.py         | 192 +++++++++++++++++++++++++++++++++-----
 loopy/symbolic.py         |   4 +-
 10 files changed, 304 insertions(+), 104 deletions(-)

diff --git a/MEMO b/MEMO
index 213114e59..f88800f52 100644
--- a/MEMO
+++ b/MEMO
@@ -1,10 +1,6 @@
 TODO list
 ^^^^^^^^^
 
-Immediately:
-------------
-TODO: Imitate codegen bulk slab handling in bulk slab trials
-
 For writeup:
 ------------
 TODO: Reimplement forced lengths
@@ -61,7 +57,14 @@ Things to consider
 
 - Syntax to declare insn deps
 
-- Make CSE tags replaceable by array names if the access is unique
+- reimplement add_prefetch
+
+- user interface for dim length prescription
+
+- make syntax for explicit loop dependencies
+
+- multiple insns could fight over which iname gets local axis 0
+  -> complicated optimization problem
 
 Dealt with
 ^^^^^^^^^^
diff --git a/loopy/__init__.py b/loopy/__init__.py
index bb6bb6a48..606d579c6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -177,9 +177,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         parallel_inames = duplicate_inames
 
     dup_iname_to_tag = dup_iname_to_tag.copy()
-    from loopy.kernel import TAG_AUTO_WORK_ITEM_IDX
+    from loopy.kernel import TAG_AUTO_LOCAL_IDX
     for piname in parallel_inames:
-        dup_iname_to_tag[piname] = TAG_AUTO_WORK_ITEM_IDX()
+        dup_iname_to_tag[piname] = TAG_AUTO_LOCAL_IDX()
 
     for diname in duplicate_inames:
         dup_iname_to_tag.setdefault(diname, None)
@@ -211,10 +211,10 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
     target_var_name = kernel.make_unique_var_name(cse_tag)
 
-    from loopy.kernel import (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX,
+    from loopy.kernel import (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX,
             TAG_GROUP_IDX)
     target_var_is_local = any(
-            isinstance(tag, (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX))
+            isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX))
             for tag in dup_iname_to_tag.itervalues())
 
     cse_lookup_table = {}
@@ -254,7 +254,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
             else:
                 tag = kernel.iname_to_tag[iname]
 
-            if isinstance(tag, (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX)):
+            if isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)):
                 kind = "l"
             elif isinstance(tag, TAG_GROUP_IDX):
                 kind = "g"
@@ -507,7 +507,7 @@ def get_problems(kernel, parameters, emit_warnings=True):
         msgs.append((severity, s))
 
     glens = kernel.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True)
-    llens = kernel.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False)
+    llens = kernel.tag_type_lengths(TAG_LOCAL_IDX, allow_parameters=False)
 
     from pymbolic import evaluate
     glens = evaluate(glens, parameters)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 6b9cb966f..1c0e358e4 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -240,10 +240,10 @@ def generate_code(kernel):
 
     # {{{ symbolic names for group and local indices
 
-    from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX
+    from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX
     for what_cls, func in [
             (TAG_GROUP_IDX, "get_group_id"),
-            (TAG_WORK_ITEM_IDX, "get_local_id")]:
+            (TAG_LOCAL_IDX, "get_local_id")]:
         for iname in kernel.ordered_inames_by_tag_type(what_cls):
             lower, upper, equality = kernel.get_bounds(iname, (iname,), allow_parameters=True)
             assert not equality
@@ -275,13 +275,13 @@ def generate_code(kernel):
     body.extend([Line(), gen_code.ast])
     #print "# conditionals: %d" % gen_code.num_conditionals
 
-    from loopy.kernel import TAG_WORK_ITEM_IDX
+    from loopy.kernel import TAG_LOCAL_IDX
     mod.append(
         FunctionBody(
             CLRequiredWorkGroupSize(
                 tuple(dim_length
                     for dim_length in kernel.tag_type_lengths(
-                        TAG_WORK_ITEM_IDX,
+                        TAG_LOCAL_IDX,
                         allow_parameters=False)),
                 CLKernel(FunctionDeclaration(
                     Value("void", kernel.name), args))),
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index e2cc6f0f3..5d89b1048 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -301,8 +301,8 @@ def get_valid_check_vars(kernel, sched_index, allow_ilp, exclude_tag_classes=())
         # prefetches are scheduled, we may not check work item indices
         # (and thereby conceivably mask out some work items).
 
-        from loopy.kernel import TAG_WORK_ITEM_IDX
-        allowed_vars -= set(kernel.inames_by_tag_type(TAG_WORK_ITEM_IDX))
+        from loopy.kernel import TAG_LOCAL_IDX
+        allowed_vars -= set(kernel.inames_by_tag_type(TAG_LOCAL_IDX))
 
     return allowed_vars
 
diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py
index 1b7206a48..c35a11fcb 100644
--- a/loopy/codegen/prefetch.py
+++ b/loopy/codegen/prefetch.py
@@ -207,8 +207,8 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map,
             pf_idx_expr = 0
             for realiz_iname, length in zip(realiz_inames, realiz_lengths):
                 tag = flnd.kernel.iname_to_tag[realiz_iname]
-                from loopy.kernel import TAG_WORK_ITEM_IDX
-                assert isinstance(tag, TAG_WORK_ITEM_IDX)
+                from loopy.kernel import TAG_LOCAL_IDX
+                assert isinstance(tag, TAG_LOCAL_IDX)
 
                 pf_idx_expr = (pf_idx_expr*length
                         + var("(int) get_local_id(%d)" % tag.axis))
@@ -301,8 +301,8 @@ def generate_prefetch_code(kernel, sched_index, exec_domain):
 
     # {{{ first, fix the user-specified fetch dims
 
-    from loopy.kernel import TAG_WORK_ITEM_IDX
-    knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX)
+    from loopy.kernel import TAG_LOCAL_IDX
+    knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_LOCAL_IDX)
     used_kernel_work_item_inames = []
 
     for realization_dim_idx, loc_fetch_axis_list in \
@@ -390,7 +390,7 @@ def generate_prefetch_code(kernel, sched_index, exec_domain):
     from loopy.codegen.bounds import get_valid_check_vars
     valid_index_vars = get_valid_check_vars(kernel, sched_index,
             allow_ilp=True,
-            exclude_tag_classes=(TAG_WORK_ITEM_IDX,))
+            exclude_tag_classes=(TAG_LOCAL_IDX,))
 
     from loopy.symbolic import LoopyCCodeMapper
     flnd = FetchLoopNestData(prefetch=pf,
diff --git a/loopy/compiled.py b/loopy/compiled.py
index a2b4210d6..864683bdd 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -48,11 +48,11 @@ class CompiledKernel:
         else:
             self.size_args = size_args
 
-        from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX
+        from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX
         gsize_expr = tuple(self.kernel.tag_type_lengths(
             TAG_GROUP_IDX, allow_parameters=True))
         lsize_expr = tuple(self.kernel.tag_type_lengths(
-            TAG_WORK_ITEM_IDX, allow_parameters=False))
+            TAG_LOCAL_IDX, allow_parameters=False))
 
         if not gsize_expr: gsize_expr = (1,)
         if not lsize_expr: lsize_expr = (1,)
diff --git a/loopy/isl.py b/loopy/isl.py
index bf7b5a064..6af301a9f 100644
--- a/loopy/isl.py
+++ b/loopy/isl.py
@@ -110,12 +110,21 @@ def set_is_universe(set):
 
 
 
-def static_max_of_pw_aff(pw_aff):
-    for set, aff in pw_aff.get_pieces():
-        candidate_pw_aff = isl.PwAff.from_aff(aff)
+def static_min_of_pw_aff(pw_aff):
+    for set, candidate_aff in pw_aff.get_pieces():
+        if set_is_universe(candidate_aff.le_set(pw_aff)):
+            return candidate_aff
+
+    raise ValueError("a static minimum was not found for PwAff '%s'"
+            % pw_aff)
+
 
-        if set_is_universe(candidate_pw_aff.ge_set(pw_aff)):
-            return aff
+
+
+def static_max_of_pw_aff(pw_aff):
+    for set, candidate_aff in pw_aff.get_pieces():
+        if set_is_universe(candidate_aff.ge_set(pw_aff)):
+            return candidate_aff
 
     raise ValueError("a static maximum was not found for PwAff '%s'"
             % pw_aff)
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 4767f58d4..c2527cf07 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -50,13 +50,19 @@ class ParallelTagWithAxis(ParallelTag, UniqueTag):
         return "%s.%d" % (
                 self.print_name, self.axis)
 
+#class MultiTag(IndexTag):
+    
+#class SplitTag(IndexTag):
+    
+
+
 class TAG_GROUP_IDX(ParallelTagWithAxis):
     print_name = "g"
 
-class TAG_WORK_ITEM_IDX(ParallelTagWithAxis):
+class TAG_LOCAL_IDX(ParallelTagWithAxis):
     print_name = "l"
 
-class TAG_AUTO_WORK_ITEM_IDX(ParallelTag):
+class TAG_AUTO_LOCAL_IDX(ParallelTag):
     def __str__(self):
         return "l.auto"
 
@@ -94,7 +100,7 @@ def parse_tag(tag):
     elif tag.startswith("g."):
         return TAG_GROUP_IDX(int(tag[2:]))
     elif tag.startswith("l."):
-        return TAG_WORK_ITEM_IDX(int(tag[2:]))
+        return TAG_LOCAL_IDX(int(tag[2:]))
     else:
         raise ValueError("cannot parse tag: %s" % tag)
 
@@ -386,7 +392,6 @@ class LoopKernel(Record):
         upper_incr) tuples that will be separated out in the execution to generate
         'bulk' slabs with fewer conditionals.
     :ivar temporary_variables:
-    :ivar workgroup_size:
     :ivar name_to_dim: A lookup table from inames to ISL-style
         (dim_type, index) tuples
     :ivar iname_to_tag:
@@ -518,6 +523,11 @@ class LoopKernel(Record):
             if var_name not in used_vars:
                 return var_name
 
+    @property
+    @memoize_method
+    def arg_dict(self):
+        return dict((arg.name, arg) for arg in self.args)
+
     @property
     @memoize_method
     def dim_to_name(self):
@@ -534,14 +544,6 @@ class LoopKernel(Record):
     def space(self):
         return self.domain.get_space()
 
-    @property
-    @memoize_method
-    def tag_key_to_iname(self):
-        return dict(
-                (tag.key, iname)
-                for iname, tag in self.iname_to_tag.iteritems()
-                if isinstance(tag, UniqueTag))
-
     @property
     @memoize_method
     def arg_dict(self):
@@ -567,57 +569,93 @@ class LoopKernel(Record):
         return [iname for iname in self.all_inames()
                 if isinstance(self.iname_to_tag.get(iname), tag_type)]
 
-    def ordered_inames_by_tag_type(self, tag_type):
-        result = []
-        from itertools import count
-        for i in count():
-            try:
-                dim = self.tag_key_to_iname[tag_type(i).key]
-            except KeyError:
-                return result
+    @memoize_method
+    def get_iname_bounds(self, iname):
+        lower_bound_pw_aff = self.domain.dim_min(self.name_to_dim[iname][1])
+        upper_bound_pw_aff = self.domain.dim_max(self.name_to_dim[iname][1])
+
+        class BoundsRecord(Record):
+            pass
+
+        size = upper_bound_pw_aff - lower_bound_pw_aff + 1
+
+        return BoundsRecord(
+                lower_bound_pw_aff=lower_bound_pw_aff,
+                upper_bound_pw_aff=upper_bound_pw_aff,
+                size=size)
+
+    def fix_grid_sizes(kernel):
+        all_inames_by_insns = set()
+        for insn in kernel.instructions:
+            all_inames_by_insns |= insn.all_inames()
+
+        if all_inames_by_insns != kernel.all_inames():
+            raise RuntimeError("inames collected from instructions "
+                    "do not match domain inames")
+
+        global_sizes = {}
+        local_sizes = {}
+
+        from loopy.kernel import (
+                TAG_GROUP_IDX, TAG_LOCAL_IDX,
+                TAG_AUTO_LOCAL_IDX)
+
+        for iname in kernel.all_inames():
+            tag = kernel.iname_to_tag.get(iname)
+
+            if isinstance(tag, TAG_GROUP_IDX):
+                tgt_dict = global_sizes
+            elif isinstance(tag, TAG_LOCAL_IDX):
+                tgt_dict = local_sizes
+            elif isinstance(tag, TAG_AUTO_LOCAL_IDX):
+                #raise RuntimeError("cannot find grid sizes if AUTO_LOCAL_IDX tags are "
+                        #"present")
+                pass
+                tgt_dict = None
             else:
-                result.append(dim)
+                tgt_dict = None
 
-    @memoize_method
-    def get_bounds_constraints(self, iname, admissible_vars, allow_parameters):
-        """Get an overapproximation of the loop bounds for the variable *iname*."""
+            if tgt_dict is None:
+                continue
 
-        from loopy.codegen.bounds import get_bounds_constraints
-        return get_bounds_constraints(self.domain, iname, admissible_vars,
-                allow_parameters)
+            bounds = kernel.get_iname_bounds(iname)
 
-    @memoize_method
-    def get_bounds(self, iname, admissible_vars, allow_parameters):
-        """Get an overapproximation of the loop bounds for the variable *iname*."""
+            size = bounds.size
 
-        from loopy.codegen.bounds import get_bounds
-        return get_bounds(self.domain, iname, admissible_vars, allow_parameters)
+            from loopy.isl import static_max_of_pw_aff
+            try:
+                size = static_max_of_pw_aff(size)
+            except ValueError:
+                pass
 
-    def tag_type_lengths(self, tag_cls, allow_parameters):
-        def get_length(iname):
-            tag = self.iname_to_tag[iname]
-            if tag.forced_length is not None:
-                return tag.forced_length
+            if tag.axis in tgt_dict:
+                tgt_dict[tag.axis] = tgt_dict[tag.axis].max(size)
+            else:
+                tgt_dict[tag.axis] = size
 
-            lower, upper, equality = self.get_bounds(iname, (iname,), 
-                    allow_parameters=allow_parameters)
-            return upper-lower
+        max_dims = kernel.device.max_work_item_dimensions
 
-        return [get_length(iname)
-                for iname in self.ordered_inames_by_tag_type(tag_cls)]
+        def to_dim_tuple(size_dict, which):
+            size_list = []
+            sorted_axes = sorted(size_dict.iterkeys())
+            while sorted_axes:
+                cur_axis = sorted_axes.pop(0)
+                while cur_axis > len(size_list):
+                    from loopy import LoopyAdvisory
+                    warn("%s axis %d unassigned--assuming length 1" % len(size_list),
+                            LoopyAdvisory)
+                    size_list.append(1)
 
-    def tag_or_iname_to_iname(self, s):
-        try:
-            tag = parse_tag(s)
-        except ValueError:
-            pass
-        else:
-            return self.tag_key_to_iname[tag.key]
+                size_list.append(size_dict[cur_axis])
+
+            if len(size_list) > max_dims:
+                raise ValueError("more %s dimensions assigned than supported "
+                        "by hardware (%d > %d)" % (which, len(size_list), max_dims))
 
-        if s not in self.all_inames():
-            raise RuntimeError("invalid index name '%s'" % s)
+            return tuple(size_list)
 
-        return s
+        return (to_dim_tuple(global_sizes, "global"),
+                to_dim_tuple(local_sizes, "local"))
 
     def local_mem_use(self):
         return sum(lv.nbytes for lv in self.temporary_variables.itervalues()
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 1756b8e66..8ec6393dc 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -24,14 +24,6 @@ class Barrier(Record):
 
 
 
-def fix_grid_sizes(kernel):
-    from warnings import warn
-    warn("fix_grid_sizes is unimplemented")
-    return kernel
-
-
-
-
 def check_double_use_of_hw_dimensions(kernel):
     from loopy.kernel import UniqueTag
 
@@ -124,6 +116,176 @@ def add_automatic_dependencies(kernel):
 
 
 
+def guess_good_iname_for_axis_0(kernel, insn):
+    from loopy.kernel import ImageArg, ScalarArg
+
+    approximate_arg_values = dict(
+            (arg.name, arg.approximately)
+            for arg in kernel.args
+            if isinstance(arg, ScalarArg))
+
+    # {{{ find all array accesses in insn
+
+    from loopy.symbolic import ArrayAccessFinder
+    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))
+
+    from pymbolic.primitives import Subscript
+
+    if isinstance(insn.assignee, Subscript):
+        ary_acc_exprs.append(insn.assignee)
+    print ary_acc_exprs
+
+    # }}}
+
+    # {{{ filter array accesses to only the global ones
+
+    global_ary_acc_exprs = []
+
+    for aae in ary_acc_exprs:
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+        if arg is None:
+            continue
+
+        if isinstance(arg, ImageArg):
+            continue
+
+        global_ary_acc_exprs.append(aae)
+
+    # }}}
+
+    # {{{ figure out which iname should get mapped to local axis 0
+
+    # maps inames to vote counts
+    vote_count_for_l0 = {}
+
+    from loopy.symbolic import CoefficientCollector
+
+    from pytools import argmin2, argmax2
+
+    for aae in global_ary_acc_exprs:
+        index_expr = aae.index
+        if not isinstance(index_expr, tuple):
+            index_expr = (index_expr,)
+
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+
+        ary_strides = arg.strides
+        if ary_strides is None and len(index_expr) == 1:
+            ary_strides = (1,)
+
+        iname_to_stride = {}
+        for iexpr_i, stride in zip(index_expr, ary_strides):
+            coeffs = CoefficientCollector()(iexpr_i)
+            for var_name, coeff in coeffs.iteritems():
+                if var_name != 1:
+                    new_stride = coeff*stride
+                    old_stride = iname_to_stride.get(var_name, None)
+                    if old_stride is None or new_stride < old_stride:
+                        iname_to_stride[var_name] = new_stride
+
+        from pymbolic import evaluate
+        least_stride_iname, least_stride = argmin2((
+                (iname,
+                    evaluate(iname_to_stride[iname], approximate_arg_values))
+                for iname in iname_to_stride),
+                return_value=True)
+
+        if least_stride == 1:
+            vote_strength = 1
+        else:
+            vote_strength = 0.5
+
+        vote_count_for_l0[least_stride_iname] = (
+                vote_count_for_l0.get(least_stride_iname, 0)
+                + vote_strength)
+
+    return argmax2(vote_count_for_l0.iteritems())
+
+    # }}}
+
+
+
+
+
+def find_inadmissible_tag_keys(kernel, iname, iname_to_tag=None):
+    if iname_to_tag is None:
+        iname_to_tag = kernel.iname_to_tag
+
+    result = set()
+
+    from loopy.kernel import UniqueTag
+
+    for insn in kernel.instructions:
+        if iname in insn.all_inames():
+            for insn_iname in insn.all_inames():
+                if insn_iname == iname:
+                    continue
+
+                tag = iname_to_tag.get(insn_iname)
+                if isinstance(tag, UniqueTag):
+                    result.add(tag.key)
+
+    return result
+
+
+
+
+def assign_automatic_axes(kernel):
+    from loopy.kernel import (
+            TAG_AUTO_LOCAL_IDX, TAG_LOCAL_IDX)
+
+    new_iname_to_tag = kernel.iname_to_tag
+
+    # first assign each insn's axis 0, then the rest
+    for only_axis_0 in [True, False]:
+
+        for insn in kernel.instructions:
+            auto_axis_inames = [
+                    iname
+                    for iname in insn.all_inames()
+                    if isinstance(new_iname_to_tag.get(iname), TAG_AUTO_LOCAL_IDX)]
+
+            if not auto_axis_inames:
+                continue
+
+            local_assigned_axes = set()
+
+            for iname in insn.all_inames():
+                tag = new_iname_to_tag.get(iname)
+                if isinstance(tag, TAG_LOCAL_IDX):
+                    local_assigned_axes.add(tag.axis)
+
+            if 0 not in local_assigned_axes:
+                axis0_iname = guess_good_iname_for_axis_0(kernel, insn)
+
+                axis0_iname_tag = new_iname_to_tag.get(axis0_iname)
+                ax0_tag = TAG_LOCAL_IDX(0)
+                if (isinstance(axis0_iname_tag, TAG_AUTO_LOCAL_IDX)
+                        and ax0_tag.key not in find_inadmissible_tag_keys(
+                            kernel, axis0_iname, new_iname_to_tag)):
+                    new_iname_to_tag[axis0_iname] = ax0_tag
+                    local_assigned_axes.add(0)
+                    auto_axis_inames.remove(axis0_iname)
+
+            if only_axis_0:
+                continue
+
+            next_axis = 0
+            while auto_axis_inames:
+                iname = auto_axis_inames.pop()
+                while next_axis in local_assigned_axes:
+                    next_axis += 1
+
+                new_iname_to_tag[iname] = TAG_LOCAL_IDX(next_axis)
+                local_assigned_axes.add(next_axis)
+
+    return kernel.copy(iname_to_tag=new_iname_to_tag)
+
+
+
+
 def generate_loop_schedules_internal(kernel, schedule=[]):
     all_insn_ids = set(insn.id for insn in kernel.instructions)
 
@@ -391,21 +553,9 @@ def generate_loop_schedules(kernel):
 
     # }}}
 
-    kernel = fix_grid_sizes(kernel)
-
-    if 0:
-        loop_dep_graph = generate_loop_dep_graph(kernel)
-        for k, v in loop_dep_graph.iteritems():
-            print "%s: %s" % (k, ",".join(v))
-        1/0
-
     kernel = add_automatic_dependencies(kernel)
 
-    print kernel
-
-    #grid_size, group_size = find_known_grid_and_group_sizes(kernel)
-
-    #kernel = assign_grid_and_group_indices(kernel)
+    kernel = assign_automatic_axes(kernel)
 
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index e3e599e8b..cde52d38a 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -224,7 +224,7 @@ class CoefficientCollector(RecursiveMapper):
 
 # {{{ variable index expression collector
 
-class VariableIndexExpressionCollector(CombineMapper):
+class ArrayAccessFinder(CombineMapper):
     def __init__(self, tgt_vector_name=None):
         self.tgt_vector_name = tgt_vector_name
 
@@ -243,7 +243,7 @@ class VariableIndexExpressionCollector(CombineMapper):
         assert isinstance(expr.aggregate, Variable)
 
         if self.tgt_vector_name is None or expr.aggregate.name == self.tgt_vector_name:
-            return set([expr.index]) | self.rec(expr.index)
+            return set([expr]) | self.rec(expr.index)
         else:
             return CombineMapper.map_subscript(self, expr)
 
-- 
GitLab