From e9fee6d161abe517d1b6a6058ac468c4729df1e7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Wed, 19 Jun 2013 08:33:44 -0400 Subject: [PATCH] Various minor tweaks --- MEMO | 8 +++++--- loopy/codegen/__init__.py | 9 +++++++++ loopy/codegen/bounds.py | 2 +- loopy/codegen/loop.py | 3 +++ loopy/compiled.py | 2 +- loopy/schedule.py | 1 - test/test_loopy.py | 5 ++++- 7 files changed, 23 insertions(+), 7 deletions(-) diff --git a/MEMO b/MEMO index 8b9700092..bb3ab9410 100644 --- a/MEMO +++ b/MEMO @@ -16,13 +16,17 @@ Things to consider - Every loop in loopy is opened at most once. Too restrictive? -- Why do CSEs necessarily have to duplicate the inames? +- Why do precomputes necessarily have to duplicate the inames? -> because that would be necessary for a sequential prefetch - Cannot do slab decomposition on inames that share a tag with other inames -> Is that reasonable? +- Entering a loop means: + - setting up conditionals related to it (slabs/bounds) + - allowing loops nested inside to depend on loop state + - Not using all hw loop dimensions causes an error, as is the case for variant 3 in the rank_one test. @@ -53,7 +57,6 @@ To-do - rename IndexTag -> InameTag - Data implementation tags - - retag semantics once strides have been computed - turn base_indices into offset - vectorization - automatic copies @@ -63,7 +66,6 @@ To-do - automatic copies from an array with one set of tags to the same array with another set. - - Make tests run on GPUs Fixes: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 5ebce957c..5201d9be1 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -150,6 +150,11 @@ class CodeGenerationState(object): self.c_code_mapper = c_code_mapper + def copy(self, implemented_domain=None, c_code_mapper=None): + return CodeGenerationState( + implemented_domain=implemented_domain or self.implemented_domain, + c_code_mapper=c_code_mapper or self.c_code_mapper) + def intersect(self, other): new_impl, new_other = isl.align_two(self.implemented_domain, other) return CodeGenerationState( @@ -217,6 +222,8 @@ class POD(PODBase): # }}} +# {{{ implemented data info + class ImplementedDataInfo(Record): """ .. attribute:: name @@ -269,6 +276,8 @@ class ImplementedDataInfo(Record): stride_for_name_and_axis=stride_for_name_and_axis, allows_offset=allows_offset) +# }}} + # {{{ main code generation entrypoint diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 17641ec01..3beb9a2ee 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -54,7 +54,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain, else: result = result.compute_divs() - result = isl.align_spaces(result, implemented_domain) + result, implemented_domain = isl.align_two(result, implemented_domain) result = result.gist(implemented_domain) if overapproximate: diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a20ec818b..bfc0273c0 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -199,6 +199,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, from loopy.isl_helpers import static_value_of_pw_aff lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff, constants_only=False) + + # These bounds are 'implemented' by the hardware. Make sure + # that the downstream conditional generators realize that. slab = make_slab(domain.get_space(), iname, lower_bound, lower_bound+hw_axis_size) codegen_state = codegen_state.intersect(slab) diff --git a/loopy/compiled.py b/loopy/compiled.py index d799186f5..ae443679b 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -748,7 +748,7 @@ class CompiledKernel: return kernel @memoize_method - def cl_kernel_info(self, arg_to_dtype_set, code_op=None): + def cl_kernel_info(self, arg_to_dtype_set=frozenset(), code_op=None): kernel = self.get_kernel(arg_to_dtype_set) from loopy.codegen import generate_code diff --git a/loopy/schedule.py b/loopy/schedule.py index 52d001ed4..3bcf975d9 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -196,7 +196,6 @@ def loop_nest_map(kernel): # ILP tags are special because they are parallel tags # and therefore 'in principle' nest around everything. # But they're realized by the scheduler as a loop - # (and the scheduler is the only custom # at the innermost level, so we'll cut them some # slack here. continue diff --git a/test/test_loopy.py b/test/test_loopy.py index 272b0130b..01ea6b6fd 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1127,8 +1127,11 @@ def test_vector_ilp_with_prefetch(ctx_factory): knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - code, info = lp.generate_code(knl) + cknl = lp.CompiledKernel(ctx, knl) + cknl.cl_kernel_info() + import re + code = cknl.get_code() assert len(list(re.finditer("barrier", code))) == 1 -- GitLab