diff --git a/MEMO b/MEMO
index 24ddf973cf5174f79c8bc003de217da586591814..815d874235c804dfa3d7d8a3a8dc038cbc8902bf 100644
--- a/MEMO
+++ b/MEMO
@@ -6,10 +6,7 @@ For writeup:
 TODO: Reimplement forced lengths
 TODO: Try, fix reg. prefetch (DG example) / CSEs
   ILP and reg. prefetch interact!
-TODO: Custom reductions per red. axis
 TODO: Functions
-TODO: Common subexpressions
-TODO: Array common subexpressions (shared and private!)
 TODO: ILP arrays
 FIXME: support non-reductive dimensions (what did I mean here?)
 FIXME: write names should be assigned during scheduling
@@ -96,6 +93,8 @@ TODO
 Dealt with
 ^^^^^^^^^^
 
+- Check for non-use of hardware axes
+
 - Slab decomposition for parallel dimensions
   - implement at the outermost nesting level regardless
   - bound *all* tagged inames
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 0dbb8af7455963d53a611a97f87fe58f256116a4..55137d1d8b90b97a67eae6d9cf0a2969fc46a4db 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -100,14 +100,52 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
 
 
 
-def check_double_use_of_hw_dimensions(kernel):
-    from loopy.kernel import UniqueTag
+def check_non_use_of_hw_axes(kernel):
+    """Verify that every instruction uses exactly the hardware axes of the grid.
+
+    Raises :exc:`RuntimeError` if the group or local axes used by the inames
+    of an instruction differ from the axes of the grid sizes returned by
+    ``kernel.get_grid_sizes_as_exprs()``, or if an iname still carries an
+    automatic local tag.
+    """
+    group_size, local_size = kernel.get_grid_sizes_as_exprs()
+
+    group_axes = set(range(len(group_size)))
+    local_axes = set(range(len(local_size)))
+
+    from loopy.kernel import TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX, TAG_GROUP_IDX
+    for insn in kernel.instructions:
+        group_axes_used = set()
+        local_axes_used = set()
+
+        for iname in insn.all_inames():
+            tag = kernel.iname_to_tag.get(iname)
+
+            if isinstance(tag, TAG_LOCAL_IDX):
+                local_axes_used.add(tag.axis)
+            elif isinstance(tag, TAG_GROUP_IDX):
+                group_axes_used.add(tag.axis)
+            elif isinstance(tag, TAG_AUTO_LOCAL_IDX):
+                raise RuntimeError("auto local tag encountered")
+
+        if group_axes != group_axes_used:
+            raise RuntimeError("instruction '%s' does not use all hw group axes"
+                    % insn.id)
+        if local_axes != local_axes_used:
+            raise RuntimeError("instruction '%s' does not use all hw local axes"
+                    % insn.id)
+
+
+
+
+
+def check_double_use_of_hw_axes(kernel):
+    from loopy.kernel import HardwareParallelTag
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
         for iname in insn.all_inames():
             tag = kernel.iname_to_tag.get(iname)
-            if isinstance(tag, UniqueTag):
+            if isinstance(tag, HardwareParallelTag):
                 key = tag.key
                 if key in insn_tag_keys:
                     raise RuntimeError("instruction '%s' has two "
@@ -669,9 +707,7 @@ def insert_barriers(kernel, schedule, level=0):
 
 def generate_loop_schedules(kernel):
     kernel = realize_reduction(kernel)
-
-    check_double_use_of_hw_dimensions(kernel)
-
+    check_double_use_of_hw_axes(kernel)
     kernel = adjust_local_temp_var_storage(kernel)
 
     # {{{ check that all CSEs have been realized
@@ -687,8 +723,8 @@ def generate_loop_schedules(kernel):
     # }}}
 
     kernel = add_automatic_dependencies(kernel)
-
     kernel = assign_automatic_axes(kernel)
+    check_non_use_of_hw_axes(kernel)
 
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)