Commit e9fee6d1 authored by Andreas Klöckner

Various minor tweaks

parent 59474f3c
@@ -16,13 +16,17 @@ Things to consider
- Every loop in loopy is opened at most once.
Too restrictive?
- Why do CSEs necessarily have to duplicate the inames?
- Why do precomputes necessarily have to duplicate the inames?
-> because that would be necessary for a sequential prefetch
- Cannot do slab decomposition on inames that share a tag with
other inames
-> Is that reasonable?
- Entering a loop means:
- setting up conditionals related to it (slabs/bounds)
- allowing loops nested inside to depend on loop state
- Not using all hw loop dimensions causes an error, as
is the case for variant 3 in the rank_one test.
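Aside (not part of the commit): a hedged sketch of the "conditionals related to slabs/bounds" item above, reusing only calls that appear elsewhere in this diff (lp.split_iname, lp.generate_code) and assuming a kernel knl over 0 <= i < n already exists:

    knl = lp.split_iname(knl, "i", 16)
    code, info = lp.generate_code(knl)
    # If n is not known to be a multiple of 16, entering the i_inner loop
    # also emits a bounds conditional along the lines of
    #     if (i_inner + 16*i_outer < n) { ... }
    # and loops nested inside may then depend on the current loop state.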
@@ -53,7 +57,6 @@ To-do
- rename IndexTag -> InameTag
- Data implementation tags
- retag semantics once strides have been computed
- turn base_indices into offset
- vectorization
- automatic copies
@@ -63,7 +66,6 @@ To-do
- automatic copies from an array with one set of tags
to the same array with another set.
- Make tests run on GPUs
Fixes:
@@ -150,6 +150,11 @@ class CodeGenerationState(object):
self.c_code_mapper = c_code_mapper
def copy(self, implemented_domain=None, c_code_mapper=None):
return CodeGenerationState(
implemented_domain=implemented_domain or self.implemented_domain,
c_code_mapper=c_code_mapper or self.c_code_mapper)
def intersect(self, other):
new_impl, new_other = isl.align_two(self.implemented_domain, other)
return CodeGenerationState(
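Note: the hunk is cut off after "return CodeGenerationState(". A hedged guess at the completion of intersect(), assuming the constructor takes the same keyword arguments as copy() above and that islpy is imported as isl:

    def intersect(self, other):
        # Align both objects to a common space, then record the combined
        # implemented domain in a fresh state object.
        new_impl, new_other = isl.align_two(self.implemented_domain, other)
        return CodeGenerationState(
            implemented_domain=new_impl.intersect(new_other),
            c_code_mapper=self.c_code_mapper)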
@@ -217,6 +222,8 @@ class POD(PODBase):
# }}}
# {{{ implemented data info
class ImplementedDataInfo(Record):
"""
.. attribute:: name
@@ -269,6 +276,8 @@ class ImplementedDataInfo(Record):
stride_for_name_and_axis=stride_for_name_and_axis,
allows_offset=allows_offset)
# }}}
# {{{ main code generation entrypoint
@@ -54,7 +54,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
else:
result = result.compute_divs()
result = isl.align_spaces(result, implemented_domain)
result, implemented_domain = isl.align_two(result, implemented_domain)
result = result.gist(implemented_domain)
if overapproximate:
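Aside (not from the commit): a self-contained islpy illustration of the align_two-then-gist pattern used here; gist drops constraints that the implemented domain already guarantees, leaving only the checks that still need to be emitted:

    import islpy as isl

    bounds = isl.Set("[n] -> { [i] : 0 <= i and i < n }")
    implemented = isl.Set("[n] -> { [i] : 0 <= i and i < 16 }")

    bounds, implemented = isl.align_two(bounds, implemented)
    print(bounds.gist(implemented))
    # 0 <= i is implied by the implemented domain and disappears; the
    # result is roughly "[n] -> { [i] : i < n }" (printed form may vary).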
@@ -199,6 +199,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
from loopy.isl_helpers import static_value_of_pw_aff
lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff,
constants_only=False)
# These bounds are 'implemented' by the hardware. Make sure
# that the downstream conditional generators realize that.
slab = make_slab(domain.get_space(), iname,
lower_bound, lower_bound+hw_axis_size)
codegen_state = codegen_state.intersect(slab)
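Aside (hedged illustration, not committed code): for a hardware axis of size 16 with lower bound 0 driving a hypothetical iname i_outer, the slab recorded here amounts to the set

    import islpy as isl
    slab = isl.Set("{ [i_outer] : 0 <= i_outer and i_outer < 16 }")
    # Intersecting this into codegen_state.implemented_domain lets
    # get_bounds_checks() later gist away conditionals that the hardware
    # axis already guarantees.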
@@ -748,7 +748,7 @@ class CompiledKernel:
return kernel
@memoize_method
def cl_kernel_info(self, arg_to_dtype_set, code_op=None):
def cl_kernel_info(self, arg_to_dtype_set=frozenset(), code_op=None):
kernel = self.get_kernel(arg_to_dtype_set)
from loopy.codegen import generate_code
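With the new frozenset() default, kernel info can be requested without pre-specializing argument dtypes. A usage sketch mirroring the updated test at the bottom of this diff:

    cknl = lp.CompiledKernel(ctx, knl)
    cknl.cl_kernel_info()   # equivalent to cl_kernel_info(frozenset())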
@@ -196,7 +196,6 @@ def loop_nest_map(kernel):
# ILP tags are special because they are parallel tags
# and therefore 'in principle' nest around everything.
# But they're realized by the scheduler as a loop
# (and the scheduler is the only custom
# at the innermost level, so we'll cut them some
# slack here.
continue
@@ -1127,8 +1127,11 @@ def test_vector_ilp_with_prefetch(ctx_factory):
knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
code, info = lp.generate_code(knl)
cknl = lp.CompiledKernel(ctx, knl)
cknl.cl_kernel_info()
import re
code = cknl.get_code()
assert len(list(re.finditer("barrier", code))) == 1