diff --git a/MEMO b/MEMO index 06f585339489c77d74d649ea47d1c92ab924ed66..f29981a086d3ba89cdeb45b09cf1f4761e451e5c 100644 --- a/MEMO +++ b/MEMO @@ -45,8 +45,6 @@ To-do - Kernel splitting (via what variables get computed in a kernel) -- Test join_inames - - Make tests run on GPUs - Streamline arg @@ -59,6 +57,8 @@ To-do - syntax for linear array access +- Fuse: store/fetch elimination? + Fixes: - Group instructions by dependency/inames for scheduling, to @@ -72,10 +72,13 @@ Fixes: Future ideas ^^^^^^^^^^^^ +<<<<<<< HEAD - Put all OpenCL functions into mangler - Fuse: store/fetch elimination? +======= +>>>>>>> d0f46221e2249d7894aed2d5e7ab21e84c419eac - Expose iname-duplicate-and-rename as a primitive. - Array language @@ -121,6 +124,10 @@ Future ideas Dealt with ^^^^^^^^^^ +- Test divisibility constraints + +- Test join_inames + - Divisibility, modulo, strides? -> Tested, gives correct (but suboptimal) code. diff --git a/loopy/__init__.py b/loopy/__init__.py index 219cc50d84476c954f241069a2027423629e399b..c186a774e578929de88d4f381c30a691fdea39b5 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -193,12 +193,19 @@ def join_inames(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): if new_iname is None: new_iname = kernel.make_unique_var_name("_and_".join(inames)) - new_domain = kernel.domain + from loopy.kernel import DomainChanger + domch = DomainChanger(kernel, frozenset(inames)) + for iname in inames: + if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: + raise RuntimeError("iname '%s' is not 'at home' in the " + "join's leaf domain" % iname) + + new_domain = domch.domain new_dim_idx = new_domain.dim(dim_type.set) new_domain = new_domain.add_dims(dim_type.set, 1) new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname) - joint_aff = zero = isl.Aff.zero_on_domain(kernel.space) + joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space) subst_dict = {} base_divisor = 1 @@ -253,7 +260,7 @@ def join_inames(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): else: result.add(iname) - return result + return frozenset(result) new_insns = [ insn.copy( @@ -265,7 +272,8 @@ def join_inames(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): result = (kernel .map_expressions(subst_map, exclude_instructions=True) .copy( - instructions=new_insns, domain=new_domain, + instructions=new_insns, + domains=domch.get_domains_with(new_domain), applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_map] )) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index db1c4f4a0007f6d1e6c7c1e6cabab9c6e28112ec..d1c644c639d144bc68589b5d6f0ef3a35d9610b4 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -107,7 +107,7 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames): if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: multi_use_inames.add(iname) - return cond_inames - multi_use_inames + return frozenset(cond_inames - multi_use_inames) @@ -224,9 +224,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): only_unshared_inames = remove_inames_for_shared_hw_axes(kernel, current_iname_set & used_inames) - bounds_checks = bounds_check_cache( - frozenset(remove_inames_for_shared_hw_axes(kernel, - only_unshared_inames))) + bounds_checks = bounds_check_cache(only_unshared_inames) if bounds_checks or candidate_group_length == 1: # length-1 must always be an option to reach the recursion base case below diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 650ce34f4562ae36ba4e9ac2342cb57b690d3d75..9c5159111eae7ced3d3dd662df9adc8b897c69f1 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -61,11 +61,11 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): .add_constraint( isl.Constraint.inequality_from_aff( iname_rel_aff(space, - iname, ">=", upper_bound_aff-upper_incr)))) + iname, ">", upper_bound_aff-upper_incr)))) upper_bulk_bound = ( isl.Constraint.inequality_from_aff( iname_rel_aff(space, - iname, "<", upper_bound_aff-upper_incr))) + iname, "<=", upper_bound_aff-upper_incr))) else: lower_slab = None @@ -123,7 +123,9 @@ def intersect_kernel_with_slab(kernel, slab, iname): home_domain = kernel.domains[hdi] new_domains = kernel.domains[:] new_domains[hdi] = home_domain & isl.align_spaces(slab, home_domain) - return kernel.copy(domains=new_domains) + + return kernel.copy(domains=new_domains, + get_grid_sizes=kernel.get_grid_sizes) # {{{ hw-parallel loop @@ -222,15 +224,6 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): usable_inames = get_usable_inames_for_conditional(kernel, sched_index) domain = kernel.get_inames_domain(loop_iname) - # move inames that are usable into parameters - for iname in domain.get_var_names(dim_type.set): - if iname in usable_inames: - dt, idx = domain.get_var_dict()[iname] - domain = domain.move_dims( - dim_type.param, domain.dim(dim_type.param), - dt, idx, 1) - - result = [] for slab_name, slab in slabs: @@ -243,7 +236,18 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): domain = isl.align_spaces(domain, slab, across_dim_types=True, obj_bigger_ok=True) dom_and_slab = domain & slab - _, loop_iname_idx = domain.get_var_dict()[loop_iname] + + # move inames that are usable into parameters + moved_inames = [] + for iname in dom_and_slab.get_var_names(dim_type.set): + if iname in usable_inames: + moved_inames.append(iname) + dt, idx = dom_and_slab.get_var_dict()[iname] + dom_and_slab = dom_and_slab.move_dims( + dim_type.param, dom_and_slab.dim(dim_type.param), + dt, idx, 1) + + _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] lbound = kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx).coalesce() ubound = kernel.cache_manager.dim_max( @@ -264,16 +268,22 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from loopy.isl_helpers import iname_rel_aff impl_slab = ( - isl.BasicSet.universe(domain.space) + isl.BasicSet.universe(dom_and_slab.space) .add_constraint( isl.Constraint.inequality_from_aff( - iname_rel_aff(domain.space, + iname_rel_aff(dom_and_slab.space, loop_iname, ">=", lbound))) .add_constraint( isl.Constraint.inequality_from_aff( - iname_rel_aff(domain.space, + iname_rel_aff(dom_and_slab.space, loop_iname, "<=", ubound)))) + for iname in moved_inames: + dt, idx = impl_slab.get_var_dict()[iname] + impl_slab = impl_slab.move_dims( + dim_type.set, impl_slab.dim(dim_type.set), + dt, idx, 1) + new_codegen_state = codegen_state.intersect(impl_slab) inner = build_loop_nest(kernel, sched_index+1, diff --git a/loopy/compiled.py b/loopy/compiled.py index c8b733d3d5e460618807999ee6c64f6f36309d5e..c8337f8b705b226a35fb026f896d60a53f0b548a 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -475,7 +475,7 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet print_ref_code=False, print_code=True, warmup_rounds=2, edit_code=False, dump_binary=False, codegen_kwargs={}, options=[], - fills_entire_output=True, check_result=None): + fills_entire_output=True, do_check=True, check_result=None): """Compare results of `ref_knl` to the kernels generated by the generator `kernel_gen`. @@ -505,7 +505,7 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet fill_value_ref = -17 fill_value = fill_value_ref - # {{{ set up CL context for reference run + # {{{ find candidate devices for reference run all_devs = [] cpu_devs = [] @@ -548,6 +548,17 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet ref_sched_kernel = knl break + try: + ref_args, arg_descriptors = \ + make_ref_args(ref_sched_kernel, ref_queue, parameters, + fill_value=fill_value_ref) + except cl.RuntimeError, e: + if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: + continue + + if not do_check: + break + ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel, options=options, codegen_kwargs=codegen_kwargs) @@ -558,13 +569,6 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet print get_highlighted_code(ref_compiled.code) print 75*"-" - try: - ref_args, arg_descriptors = \ - make_ref_args(ref_sched_kernel, ref_queue, parameters, - fill_value=fill_value_ref) - except cl.RuntimeError, e: - if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: - continue ref_queue.finish() ref_start = time() @@ -586,6 +590,8 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet # {{{ compile and run parallel code + need_check = do_check + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) @@ -610,12 +616,10 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet print compiled.cl_program.binaries[0] print 75*"-" - do_check = True - for i in range(warmup_rounds): evt, _ = compiled(queue, **args) - if do_check: + if need_check: for arg_desc in arg_descriptors: if arg_desc is None: continue @@ -637,7 +641,7 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet error_is_small, error = check_result(test_ary, ref_ary) assert error_is_small, error - do_check = False + need_check = False events = [] queue.finish() @@ -687,11 +691,12 @@ def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet print "elapsed: %g s event, %s s other-event %g s wall (%d rounds)%s" % ( elapsed, elapsed_evt_2, elapsed_wall, timing_rounds, rates) - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl) - print "ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed, ref_elapsed_wall, ref_rates) + if do_check: + ref_rates = "" + for cnt, lbl in zip(op_count, op_label): + ref_rates += " %g %s/s" % (cnt/ref_elapsed, lbl) + print "ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed, ref_elapsed_wall, ref_rates) # }}} diff --git a/loopy/cse.py b/loopy/cse.py index 809748b0bc1739b8b19e5259de0555b379ce1f43..c468c7f9d3a5f509696a42ddb02e00bea2d3217b 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -160,13 +160,22 @@ def build_global_storage_to_sweep_map(kernel, invocation_descriptors, for invdesc in invocation_descriptors: if not invdesc.expands_footprint: - arg_inames = set() + arg_inames = ( + set(global_s2s_par_dom.get_var_names(dim_type.param)) + & kernel.all_inames()) for arg in invdesc.args: arg_inames.update(get_dependencies(arg)) arg_inames = frozenset(arg_inames) - usage_domain = kernel.get_inames_domain(arg_inames) + from loopy.kernel import CannotBranchDomainTree + try: + usage_domain = kernel.get_inames_domain(arg_inames) + except CannotBranchDomainTree: + # and that's the end of that. + invdesc.is_in_footprint = False + continue + for i in xrange(usage_domain.dim(dim_type.set)): iname = usage_domain.get_dim_name(dim_type.set, i) if iname in sweep_inames: diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index b67411267ba8d8d41ee9629714b4c60009fe3ae0..57ca2e769e4d345a01ac94e94691f3a1e18df66e 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -170,6 +170,11 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) if len(pieces) == 1: return pieces[0][1] + # put constant bounds first + pieces = ( + [(set, aff) for set, aff in pieces if aff.is_cst()] + + [(set, aff) for set, aff in pieces if not aff.is_cst()]) + reference = pw_aff.get_aggregate_domain() if context is not None: diff --git a/loopy/kernel.py b/loopy/kernel.py index a3f2cbee1489bcffd501c16f330bd7cf649e73c3..835f5abf9a0367b354a0acccfcf7dc6420000365 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -12,6 +12,8 @@ import re +class CannotBranchDomainTree(RuntimeError): + pass # {{{ index tags @@ -727,7 +729,12 @@ class LoopKernel(Record): cache_manager=None, iname_to_tag_requests=None, index_dtype=np.int32, - isl_context=None): + isl_context=None, + + # When kernels get intersected in slab decomposition, + # their grid sizes shouldn't change. This provides + # a way to forward sub-kernel grid size requests. + get_grid_sizes=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" @@ -1010,6 +1017,10 @@ class LoopKernel(Record): if np.iinfo(index_dtype).min >= 0: raise TypeError("index_dtype must be signed") + if get_grid_sizes is not None: + # overwrites method down below + self.get_grid_sizes = get_grid_sizes + Record.__init__(self, device=device, domains=domains, instructions=parsed_instructions, @@ -1284,7 +1295,7 @@ class LoopKernel(Record): all_parents = set(ppd[home_domain_index]) if not domain_indices <= all_parents: - raise RuntimeError("iname set '%s' requires " + raise CannotBranchDomainTree("iname set '%s' requires " "branch in domain tree (when adding '%s')" % (", ".join(inames), iname)) @@ -1344,20 +1355,6 @@ class LoopKernel(Record): return result - # }}} - - # {{{ examine domains - - for i_dom, dom in enumerate(self.domains): - for iname in dom.get_var_names(dim_type.set): - for par_iname in dom.get_var_names(dim_type.param): - if par_iname in all_inames: - result[iname].add(par_iname) - - # }}} - - return result - # }}} # {{{ read and written variables diff --git a/test/test_loopy.py b/test/test_loopy.py index c3029091538ed94ff2558e1080fab8c3b0467792..e748e7cc9f7187be73e11dcfe2b05e4e4ca0ca59 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -61,6 +61,66 @@ def test_wg_too_small(ctx_factory): +def test_join_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel(ctx.devices[0], + "{[i,j]: 0<=i,j<16}", + [ + "b[i,j] = 2*a[i,j]" + ], + [ + lp.GlobalArg("a", np.float32, shape=(16, 16,)), + lp.GlobalArg("b", np.float32, shape=(16, 16,)) + ], + ) + + ref_knl = knl + + knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) + knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen) + + lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen) + + + + + +def test_divisibility_assumption(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel(ctx.devices[0], + "[n] -> {[i]: 0<=i