diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 8b2a8190cedb65c034e95a8f4cd6b0c9a2bd65e3..6064f5a2b0c7726ce01d54eb2efe99fec07c1f8e 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1053,7 +1053,7 @@ class DependencyRecord(Record): def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): - """If there exists a depdency between target and source and the two access + """If there exists a dependency between target and source and the two access a common variable of *var_kind* in a way that requires a barrier (essentially, at least one write), then the function will return a tuple ``(target, source, var_name)``. Otherwise, it will return *None*. diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 77a07f000869a4e7643b9c5214ddac9d7836f439..4bbf99d57145a5c3518262783eae862bf0d6a737 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -87,11 +87,22 @@ def get_common_hw_inames(kernel, insn_ids): # Get the list of hardware inames in which the temporary is defined. 
if len(insn_ids) == 0: return set() - id_to_insn = kernel.id_to_insn - from six.moves import reduce - return reduce( - set.intersection, - (get_hw_inames(kernel, id_to_insn[id]) for id in insn_ids)) + return set.intersection( + *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids)) + + +def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): + from loopy.kernel.data import HardwareParallelTag + new_schedule = [] + + for item in kernel.schedule: + if isinstance(item, (EnterLoop, LeaveLoop)): + tag = kernel.iname_to_tag.get(item.iname) + if isinstance(tag, HardwareParallelTag): + continue + new_schedule.append(item) + + return kernel.copy(schedule=new_schedule) # }}} @@ -260,7 +271,7 @@ def compute_live_temporaries(kernel, schedule): live_in[idx] = live_out[idx] = live_in[idx + 1] idx -= 1 else: - raise LoopyError("unexepcted type of schedule item: %s" + raise LoopyError("unexpected type of schedule item: %s" % type(sched_item).__name__) # }}} @@ -325,6 +336,10 @@ class PromotedTemporary(Record): def determine_temporaries_to_promote(kernel, temporaries, name_gen): """ + For each temporary in the passed list of temporaries, construct a + :class:`PromotedTemporary` which describes how the temporary should + get promoted into global storage. + :returns: A :class:`dict` mapping temporary names from `temporaries` to :class:`PromotedTemporary` objects """ @@ -343,6 +358,18 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen): assert temporary.base_storage is None, \ "Cannot promote temporaries with base_storage to global" + # `hw_inames`: The set of hw-parallel tagged inames that this temporary + # is associated with. This is used for determining the shape of the + # global storage needed for saving and restoring the temporary across + # kernel calls. + # + # TODO: Make a policy decision about which dimensions to use. 
Currently, + # the code looks at each instruction that defines or uses the temporary, + # and takes the common set of hw-parallel tagged inames associated with + # these instructions. + # + # Furthermore, in the case of local temporaries, inames that are tagged + # hw-local do not contribute to the global storage shape. hw_inames = get_common_hw_inames(kernel, def_lists[temporary.name] + use_lists[temporary.name]) @@ -350,6 +377,8 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen): hw_inames = sorted(hw_inames, key=lambda iname: str(kernel.iname_to_tag[iname])) + # Calculate the sizes of the dimensions that get added in front for + # the global storage of the temporary. shape_prefix = [] backing_hw_inames = [] @@ -403,11 +432,15 @@ def augment_domain_for_temporary_promotion( new_iname = name_gen("{name}_{mode}_dim_{dim}". format(name=orig_temporary.name, mode=mode, - dim=orig_dim + t_idx)) + dim=t_idx)) domain = domain.set_dim_name( isl.dim_type.set, orig_dim + t_idx, new_iname) - #from loopy.kernel.data import auto - #iname_to_tag[new_iname] = auto + if orig_temporary.is_local: + # If the temporary has local scope, then loads / stores can be + # done in parallel. + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() + dim_inames.append(new_iname) # Add size information. 
@@ -415,7 +448,7 @@ def augment_domain_for_temporary_promotion( domain &= aff[0].le_set(aff[new_iname]) size = orig_temporary.shape[t_idx] from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size)) + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) hw_inames = [] @@ -530,7 +563,7 @@ def restore_and_save_temporaries(kernel): for tval in tvals: from loopy.kernel.tools import DomainChanger tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(kernel, + dchg = DomainChanger(new_kernel, frozenset(sched_item.extra_inames + tval_hw_inames)) domain = dchg.domain @@ -564,7 +597,9 @@ def restore_and_save_temporaries(kernel): args = reversed(args) from loopy.kernel.data import Assignment - new_insn = Assignment(*args, id=insn_id) + new_insn = Assignment(*args, id=insn_id, + forced_iname_deps=frozenset(hw_inames + dim_inames), + forced_iname_deps_is_final=True) new_instructions.append(new_insn) @@ -612,17 +647,25 @@ def restore_and_save_temporaries(kernel): # }}} new_iname_to_tag.update(kernel.iname_to_tag) - new_temporary_variables = dict( + updated_temporary_variables = dict( (t.name, t.as_variable()) for t in new_temporaries.values()) - new_temporary_variables.update(kernel.temporary_variables) + updated_temporary_variables.update(kernel.temporary_variables) kernel = kernel.copy( iname_to_tag=new_iname_to_tag, - temporary_variables=new_temporary_variables, + temporary_variables=updated_temporary_variables, instructions=kernel.instructions + new_instructions, schedule=new_schedule ) + from loopy.kernel.tools import assign_automatic_axes + kernel = assign_automatic_axes(kernel) + + # Once assign_automatic_axes() does its job, loops in the schedule + # for newly hardware-tagged inames are no longer necessary (and in + # fact illegal), so remove them. 
+ kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) + return kernel @@ -712,7 +755,7 @@ def map_schedule_onto_host_or_device_impl(kernel): current_chunk.append(sched_item) i += 1 else: - raise LoopyError("unexepcted type of schedule item: %s" + raise LoopyError("unexpected type of schedule item: %s" % type(sched_item).__name__) if current_chunk and schedule_required_splitting: diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b570cf4ab7e4ed543213b73716b6271e61e6b69b..428f333f660c1d0965a50f0af1d7c2cd907395a2 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -615,7 +615,9 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Assign, Comment, Line def alloc_nbytes(tv): - return tv.dtype.numpy_dtype.itemsize + from six.moves import reduce + from operator import mul + return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from loopy.kernel.data import temp_var_scope diff --git a/test/test_loopy.py b/test/test_loopy.py index 26f4f2faf3b83a24932cad0ac3941696ed3e2636..5a2243fe6ebe0af2019cfbf7f74657de6d8c9908 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1104,13 +1104,16 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): knl = lp.make_kernel( "{ [i,k]: 0<=i t_private = a[k,i+1] + <> t_private_scalar = a[k,i+1] + <> t_private_array[i % 2] = a[k,i+1] c[k,i] = a[k,i+1] - out[k,i] = c[k,i] + t_private + out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + knl = lp.set_temporary_scope(knl, "t_private_scalar", "private") + knl = lp.set_temporary_scope(knl, "t_private_array", "private") ref_knl = knl @@ -1136,6 +1139,46 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): + ctx = 
ctx_factory() + + knl = lp.make_kernel( + "{ [i,k]: 0<=i t_local[i % 8,k] = i % 8 + c[k,i] = a[k,i+1] + out[k,i] = c[k,i] + t_local[i % 8,k] + """) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + + knl = lp.set_temporary_scope(knl, "t_local", "local") + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") + + # schedule + from loopy.preprocess import preprocess_kernel + knl = preprocess_kernel(knl) + + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(knl) + + # map schedule onto host or device + print(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 2 + + print(cgr.device_code()) + print(cgr.host_code()) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8)) + + def test_global_temporary(ctx_factory): ctx = ctx_factory()