diff --git a/MEMO b/MEMO index b03133efe6e622f6fb09d0cea043d2e25469fc6d..cd91b7754796aa6dd81eb4036b56184ea5bfd1a5 100644 --- a/MEMO +++ b/MEMO @@ -47,9 +47,6 @@ To-do increase sched. scalability - Multi-domain - - Incorporate loop-bound-mediated iname dependencies into domain - parenthood. - - Reenable codegen sanity check. - Kernel splitting (via what variables get computed in a kernel) @@ -65,7 +62,6 @@ To-do - Scalar insn priority - - If finding a maximum proves troublesome, move parameters into the domain - : (as in, Matlab full-slice) in prefetches @@ -126,6 +122,9 @@ Dealt with -> dealt with by type contexts - relating to Multi-Domain + - Incorporate loop-bound-mediated iname dependencies into domain + parenthood. [DONE] + - Make sure that variables that enter into loop bounds are only written exactly once. [DONE] diff --git a/loopy/check.py b/loopy/check.py index 148595636643ff0f78264539ab34d2e3843a1d7f..17df483b4d7bf0ef47870cd971631fb51e8127bb 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1,4 +1,5 @@ from __future__ import division +from islpy import dim_type @@ -85,7 +86,7 @@ def check_for_inactive_iname_access(kernel): def check_for_write_races(kernel): from loopy.symbolic import DependencyMapper - from loopy.kernel import ParallelTag, GroupIndexTag, IlpBaseTag + from loopy.kernel import ParallelTag, GroupIndexTag depmap = DependencyMapper() for insn in kernel.instructions: @@ -173,6 +174,26 @@ def check_for_orphaned_user_hardware_axes(kernel): raise RuntimeError("user-requested local hardware axis %d " "has no iname mapped to it" % axis) +def check_for_data_dependent_parallel_bounds(kernel): + from loopy.kernel import ParallelTag + + for i, dom in enumerate(kernel.domains): + dom_inames = set(dom.get_var_names(dim_type.set)) + par_inames = set(iname + for iname in dom_inames + if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + + if not par_inames: + continue + + parameters = set(dom.get_var_names(dim_type.param)) + for par in parameters: + if par in kernel.temporary_variables: + raise RuntimeError("Domain number %d has a data-dependent " + "parameter '%s' and contains parallel " + "inames '%s'. This is not allowed (for now)." + % (i, par, ", ".join(par_inames))) + # }}} def run_automatic_checks(kernel): @@ -181,7 +202,7 @@ def run_automatic_checks(kernel): check_for_unused_hw_axes_in_insns(kernel) check_for_inactive_iname_access(kernel) check_for_write_races(kernel) - + check_for_data_dependent_parallel_bounds(kernel) # {{{ sanity-check for implemented domains of each instruction diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 1ec320bef890d95490bd7bbc57bf0d11478d0ec9..c9a9b866070c544c0a7260fe5dd8f2dc8d5f8ac7 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -104,7 +104,7 @@ def constraint_to_code(ccm, cns): comp_op = ">=" from loopy.symbolic import constraint_to_expr - return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op) + return "%s %s 0" % (ccm(constraint_to_expr(cns), 'i'), comp_op) def filter_necessary_constraints(implemented_domain, constraints): return [cns @@ -131,8 +131,10 @@ def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): domain, check_inames, implemented_domain) - new_implemented_domain = implemented_domain & ( - isl.Set.universe(domain.get_space()).add_constraints(bounds_checks)) + bounds_check_set = isl.Set.universe(domain.get_space()).add_constraints(bounds_checks) + bounds_check_set, new_implemented_domain = isl.align_two( + bounds_check_set, implemented_domain) + new_implemented_domain = new_implemented_domain & bounds_check_set condition_codelets = [ constraint_to_code(ccm, cns) for cns in @@ -190,7 +192,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): from cgen import Initializer, POD, Const, Line return gen_code_block([ Initializer(Const(POD(np.int32, iname)), - ccm(equality_expr)), + ccm(equality_expr, 'i')), Line(), stmt, ]) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 1bd89ed690819c0d75d48f925f919ca62f9d9f3a..79c86a18114cb61782559a098883f27762e7baaf 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -120,7 +120,9 @@ def build_loop_nest(kernel, sched_index, codegen_state): from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, gather_schedule_subloop) - # {{{ pass 1: pre-scan schedule for my schedule items' indices + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices + + # i.e. go up to the next LeaveLoop, and skip over inner loops. my_sched_indices = [] @@ -146,7 +148,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # }}} - # {{{ pass 2: find admissible conditional inames for each schedule item + # {{{ pass 2: find admissible conditional inames for each sibling schedule item admissible_cond_inames = [ get_admissible_conditional_inames_for(kernel, sched_index) @@ -232,14 +234,21 @@ def build_loop_nest(kernel, sched_index, codegen_state): # pick largest such group group_length, bounds_checks = max(found_hoists) - if bounds_checks: - check_set = isl.BasicSet.universe(kernel.space) - for cns in bounds_checks: - check_set = check_set.add_constraint(cns) + check_set = None + for cns in bounds_checks: + cns_set = (isl.BasicSet.universe(cns.get_space()) + .add_constraint(cns)) - new_codegen_state = codegen_state.intersect(check_set) - else: + if check_set is None: + check_set = cns_set + else: + check_set, cns_set = isl.align_two(check_set, cns_set) + check_set = check_set.intersect(cns_set) + + if check_set is None: new_codegen_state = codegen_state + else: + new_codegen_state = codegen_state.intersect(check_set) if group_length == 1: # group only contains starting schedule item diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index d37a720ca084ae9bc38cde7d1edb753b81f6fdbd..36a24d5c50382443701b9a83c0f7b9ede13afea9 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -223,9 +223,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= if len(slabs) == 1: cmt = None - new_kernel = kernel.copy(domain=domain & slab) + new_codegen_state = codegen_state.intersect(slab) inner = set_up_hw_parallel_loops( - new_kernel, sched_index, codegen_state, hw_inames_left) + kernel, sched_index, new_codegen_state, hw_inames_left) result.append(add_comment(cmt, inner)) from loopy.codegen import gen_code_block diff --git a/loopy/kernel.py b/loopy/kernel.py index 36575b1033f65ed8b8c91f0e74e8076c4c2cf78a..0c9714001fc52e68da9fd58fbf2730f72188cf2c 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -848,8 +848,14 @@ class LoopKernel(Record): # {{{ process assumptions if assumptions is None: - assumptions_space = domains[0].get_space() + dom0_space = domains[0].get_space() + assumptions_space = isl.Space.params_alloc( + dom0_space.get_ctx(), dom0_space.dim(dim_type.param)) + for i in xrange(dom0_space.dim(dim_type.param)): + assumptions_space = assumptions_space.set_dim_name( + dim_type.param, i, dom0_space.get_dim_name(dim_type.param, i)) assumptions = isl.Set.universe(assumptions_space) + elif isinstance(assumptions, str): all_inames = set() all_params = set() @@ -865,6 +871,8 @@ class LoopKernel(Record): assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(), assumptions_set_str) + assert assumptions.is_params() + # }}} Record.__init__(self, @@ -955,6 +963,8 @@ class LoopKernel(Record): iname_set_stack = [] result = [] + writer_map = self.writer_map() + for dom in self.domains: parameters = set(dom.get_var_names(dim_type.param)) inames = set(dom.get_var_names(dim_type.set)) @@ -966,12 +976,38 @@ class LoopKernel(Record): discard_level_count = 0 while discard_level_count < len(iname_set_stack): - last_inames = iname_set_stack[-1-discard_level_count] + # {{{ check for parenthood by loop bound iname + last_inames = iname_set_stack[-1-discard_level_count] if last_inames & parameters: break - else: - discard_level_count += 1 + + # }}} + + # {{{ check for parenthood by written variable + + is_parent_by_variable = False + for par in parameters: + if par in self.temporary_variables: + writer_insns = writer_map[par] + + if len(writer_insns) > 1: + raise RuntimeError("loop bound '%s' " + "may only be written to once" % par) + + writer_insn, = writer_insns + writer_inames = self.insn_inames(writer_insn) + + if writer_inames & last_inames: + is_parent_by_variable = True + break + + if is_parent_by_variable: + break + + # }}} + + discard_level_count += 1 if discard_level_count: iname_set_stack = iname_set_stack[:-discard_level_count] @@ -1234,9 +1270,10 @@ class LoopKernel(Record): domain = self.get_inames_domain(frozenset([iname])) d_var_dict = domain.get_var_dict() - dom_intersect_assumptions = ( - isl.align_spaces(self.assumptions, domain, obj_bigger_ok=True) + dom_intersect_assumptions = (isl.align_spaces( + self.assumptions, domain, obj_bigger_ok=True) & domain) + lower_bound_pw_aff = ( self.cache_manager.dim_min( dom_intersect_assumptions, @@ -1252,7 +1289,7 @@ class LoopKernel(Record): pass size = (upper_bound_pw_aff - lower_bound_pw_aff + 1) - size = size.intersect_domain(self.assumptions) + size = size.gist(self.assumptions) return BoundsRecord( lower_bound_pw_aff=lower_bound_pw_aff, diff --git a/test/test_loopy.py b/test/test_loopy.py index 37d07f04e00469dbd1bc78a8f3d03f0fea61176a..17ef72f9a929a1b31b3c8dba0614fbbee2238928 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -399,6 +399,11 @@ def test_dependent_loop_bounds_2(ctx_factory): def test_dependent_loop_bounds_3(ctx_factory): + # The point of this test is that it shows a dependency between + # domains that is exclusively mediated by the row_len temporary. + # It also makes sure that row_len gets read before any + # conditionals use it. + dtype = np.dtype(np.float32) ctx = ctx_factory() @@ -409,7 +414,7 @@ def test_dependent_loop_bounds_3(ctx_factory): ], [ "<> row_len = a_row_lengths[i]", - "a[i,j] = 1", + "a[i,jj] = 1", ], [ lp.GlobalArg("a_row_lengths", np.int32), @@ -417,15 +422,87 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.ScalarArg("n", np.int32), ]) + assert knl.parents_per_domain()[1] == 0 + knl = lp.split_dimension(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_dimension(knl, "j", 128, outer_tag="g.1", - inner_tag="l.1") + cknl = lp.CompiledKernel(ctx, knl) print "---------------------------------------------------" cknl.print_code() print "---------------------------------------------------" + knl_bad = lp.split_dimension(knl, "jj", 128, outer_tag="g.1", + inner_tag="l.1") + + import pytest + with pytest.raises(RuntimeError): + list(lp.generate_loop_schedules(knl_bad)) + + + + +def test_independent_multi_domains(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel(ctx.devices[0], + [ + "{[i]: 0<=i<n}", + "{[j]: 0<=j<n}", + ], + [ + "a[i,j] = 1", + ], + [ + lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), + lp.ScalarArg("n", np.int32), + ]) + + + knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", + inner_tag="l.0") + knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", + inner_tag="l.1") + assert knl.parents_per_domain() == 2*[None] + + n = 50 + cknl = lp.CompiledKernel(ctx, knl) + evt, (a,) = cknl(queue, n=n, out_host=True) + + assert a.shape == (50, 50) + assert (a == 1).all() + + + + + +def test_bare_data_dependency(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel(ctx.devices[0], + [ + "[znirp] -> {[i]: 0<=i<znirp}", + ], + [ + "<> znirp = n", + "a[i] = 1", + ], + [ + lp.GlobalArg("a", dtype, shape=("n"), order="C"), + lp.ScalarArg("n", np.int32), + ]) + + cknl = lp.CompiledKernel(ctx, knl) + n = 20000 + evt, (a,) = cknl(queue, n=n, out_host=True) + + assert a.shape == (n,) + assert (a == 1).all() +