From e9fee6d161abe517d1b6a6058ac468c4729df1e7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 19 Jun 2013 08:33:44 -0400
Subject: [PATCH] Various minor tweaks

---
 MEMO                      | 8 +++++---
 loopy/codegen/__init__.py | 9 +++++++++
 loopy/codegen/bounds.py   | 2 +-
 loopy/codegen/loop.py     | 3 +++
 loopy/compiled.py         | 2 +-
 loopy/schedule.py         | 1 -
 test/test_loopy.py        | 5 ++++-
 7 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/MEMO b/MEMO
index 8b9700092..bb3ab9410 100644
--- a/MEMO
+++ b/MEMO
@@ -16,13 +16,17 @@ Things to consider
 - Every loop in loopy is opened at most once.
   Too restrictive?
 
-- Why do CSEs necessarily have to duplicate the inames?
+- Why do precomputes necessarily have to duplicate the inames?
   -> because that would be necessary for a sequential prefetch
 
 - Cannot do slab decomposition on inames that share a tag with
   other inames
   -> Is that reasonable?
 
+- Entering a loop means:
+  - setting up conditionals related to it (slabs/bounds)
+  - allowing loops nested inside to depend on loop state
+
 - Not using all hw loop dimensions causes an error, as
   is the case for variant 3 in the rank_one test.
 
@@ -53,7 +57,6 @@ To-do
 - rename IndexTag -> InameTag
 
 - Data implementation tags
-  - retag semantics once strides have been computed
   - turn base_indices into offset
   - vectorization
   - automatic copies
@@ -63,7 +66,6 @@ To-do
   - automatic copies from an array with one set of tags
     to the same array with another set.
 
-
 - Make tests run on GPUs
 
 Fixes:
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 5ebce957c..5201d9be1 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -150,6 +150,11 @@ class CodeGenerationState(object):
 
         self.c_code_mapper = c_code_mapper
 
+    def copy(self, implemented_domain=None, c_code_mapper=None):
+        return CodeGenerationState(
+                implemented_domain=implemented_domain or self.implemented_domain,
+                c_code_mapper=c_code_mapper or self.c_code_mapper)
+
     def intersect(self, other):
         new_impl, new_other = isl.align_two(self.implemented_domain, other)
         return CodeGenerationState(
@@ -217,6 +222,8 @@ class POD(PODBase):
 # }}}
 
 
+# {{{ implemented data info
+
 class ImplementedDataInfo(Record):
     """
     .. attribute:: name
@@ -269,6 +276,8 @@ class ImplementedDataInfo(Record):
                 stride_for_name_and_axis=stride_for_name_and_axis,
                 allows_offset=allows_offset)
 
+# }}}
+
 
 # {{{ main code generation entrypoint
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 17641ec01..3beb9a2ee 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -54,7 +54,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
     else:
         result = result.compute_divs()
 
-    result = isl.align_spaces(result, implemented_domain)
+    result, implemented_domain = isl.align_two(result, implemented_domain)
     result = result.gist(implemented_domain)
 
     if overapproximate:
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index a20ec818b..bfc0273c0 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -199,6 +199,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
     from loopy.isl_helpers import static_value_of_pw_aff
     lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff,
             constants_only=False)
+
+    # These bounds are 'implemented' by the hardware. Intersect them into the
+    # implemented domain so that downstream conditional generators know that.
     slab = make_slab(domain.get_space(), iname,
             lower_bound, lower_bound+hw_axis_size)
     codegen_state = codegen_state.intersect(slab)
diff --git a/loopy/compiled.py b/loopy/compiled.py
index d799186f5..ae443679b 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -748,7 +748,7 @@ class CompiledKernel:
         return kernel
 
     @memoize_method
-    def cl_kernel_info(self, arg_to_dtype_set, code_op=None):
+    def cl_kernel_info(self, arg_to_dtype_set=frozenset(), code_op=None):
         kernel = self.get_kernel(arg_to_dtype_set)
 
         from loopy.codegen import generate_code
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 52d001ed4..3bcf975d9 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -196,7 +196,6 @@ def loop_nest_map(kernel):
                 # ILP tags are special because they are parallel tags
                 # and therefore 'in principle' nest around everything.
                 # But they're realized by the scheduler as a loop
-                # (and the scheduler is the only custom
                 # at the innermost level, so we'll cut them some
                 # slack here.
                 continue
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 272b0130b..01ea6b6fd 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1127,8 +1127,11 @@ def test_vector_ilp_with_prefetch(ctx_factory):
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
 
-    code, info = lp.generate_code(knl)
+    cknl = lp.CompiledKernel(ctx, knl)
+    cknl.cl_kernel_info()
+
     import re
+    code = cknl.get_code()
     assert len(list(re.finditer("barrier", code))) == 1
 
 
-- 
GitLab