diff --git a/MEMO b/MEMO index 0e3d65aad124a443f4e99260fac8ff931f954584..a5cbf93f05ce422468529ba8bfa140de3b713e4b 100644 --- a/MEMO +++ b/MEMO @@ -88,6 +88,10 @@ TODO - Better for loop bound generation -> Try a triangular loop +- AUTO_PICK or AUTO_FIT + +- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT + Dealt with ^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index ce40f0d2b98dd139a41ac27a50e40d289173c83d..d3adafe88ce31bc54dd0bfddf62135155bbf098e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,5 +1,4 @@ from __future__ import division -import isl def register_mpz_with_pymbolic(): from pymbolic.primitives import register_constant_class @@ -28,7 +27,7 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ImageArg -from loopy.kernel import LoopKernel +from loopy.kernel import LoopKernel, AutoFitLocalIndexTag from loopy.schedule import generate_loop_schedules from loopy.compiled import CompiledKernel, drive_timing_run @@ -63,7 +62,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None, s.set_dim_name(dim_type.set, outer_var_nr, outer_iname) s.set_dim_name(dim_type.set, inner_var_nr, inner_iname) - from loopy.isl import make_slab + from loopy.isl_helpers import make_slab space = s.get_space() inner_constraint_set = ( @@ -156,7 +155,7 @@ def tag_dimensions(kernel, iname_to_tag): def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=None, - dup_iname_to_tag={}, new_inames=None): + dup_iname_to_tag={}, new_inames=None, default_tag_class=AutoFitLocalIndexTag): """ :arg duplicate_inames: which inames are supposed to be separate loops in the CSE. Also determines index order of temporary array. @@ -174,9 +173,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non parallel_inames = duplicate_inames dup_iname_to_tag = dup_iname_to_tag.copy() - from loopy.kernel import TAG_AUTO_LOCAL_IDX for piname in parallel_inames: - dup_iname_to_tag[piname] = TAG_AUTO_LOCAL_IDX() + dup_iname_to_tag[piname] = default_tag_class() for diname in duplicate_inames: dup_iname_to_tag.setdefault(diname, None) @@ -208,10 +206,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non target_var_name = kernel.make_unique_var_name(cse_tag) - from loopy.kernel import (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX, - TAG_GROUP_IDX) + from loopy.kernel import (LocalIndexTagBase, GroupIndexTag) target_var_is_local = any( - isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)) + isinstance(tag, LocalIndexTagBase) for tag in dup_iname_to_tag.itervalues()) cse_lookup_table = {} @@ -234,7 +231,6 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non if cse_result_insns: raise RuntimeError("CSE tag '%s' is not unique" % cse_tag) - # {{{ decide what to do with each iname parent_inames = insn.all_inames() @@ -251,9 +247,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non else: tag = kernel.iname_to_tag[iname] - if isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)): + if isinstance(tag, LocalIndexTagBase): kind = "l" - elif isinstance(tag, TAG_GROUP_IDX): + elif isinstance(tag, GroupIndexTag): kind = "g" else: kind = "o" @@ -389,7 +385,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non lower_bound_pw_aff = new_domain.dim_min(new_iname_to_dim[iname][1]) upper_bound_pw_aff = new_domain.dim_max(new_iname_to_dim[iname][1]) - from loopy.isl import static_max_of_pw_aff + from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr target_var_shape.append(static_max_of_pw_aff( diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index f022e60983eb9fd4b95f6301fe74ae24105863ad..fee274b465c2fc305ccc5b9afb0999d4c33ae65e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -134,17 +134,17 @@ def make_initial_assignments(kernel): global_size, local_size = kernel.get_grid_sizes() - from loopy.kernel import TAG_LOCAL_IDX, TAG_GROUP_IDX + from loopy.kernel import LocalIndexTag, GroupIndexTag from pymbolic import var for iname in kernel.all_inames(): tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, TAG_LOCAL_IDX): + if isinstance(tag, LocalIndexTag): hw_axis_expr = var("lid")(tag.axis) hw_axis_size = local_size[tag.axis] - elif isinstance(tag, TAG_GROUP_IDX): + elif isinstance(tag, GroupIndexTag): hw_axis_expr = var("gid")(tag.axis) hw_axis_size = global_size[tag.axis] diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index e3cd775a435d67831a9bf6355bfa97d186b3bcdd..b79ce784c9fe83fae0b260d62d8d9a87a040d4e7 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -12,7 +12,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index): inames if there is a barrier nested somewhere within. """ - from loopy.kernel import TAG_LOCAL_IDX, ParallelTag + from loopy.kernel import LocalIndexTag, ParallelTag from loopy.schedule import find_active_inames_at, has_barrier_within result = find_active_inames_at(kernel, sched_index) @@ -21,7 +21,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index): for iname, tag in kernel.iname_to_tag.iteritems(): if isinstance(tag, ParallelTag): - if not has_barrier or not isinstance(tag, TAG_LOCAL_IDX): + if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) return result @@ -41,8 +41,8 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): generate_unroll_loop, generate_sequential_loop_dim_code) - from loopy.kernel import TAG_UNROLL, SequentialTag - if isinstance(tag, TAG_UNROLL): + from loopy.kernel import UnrollTag, SequentialTag + if isinstance(tag, UnrollTag): func = generate_unroll_loop elif tag is None or isinstance(tag, SequentialTag): func = generate_sequential_loop_dim_code @@ -213,7 +213,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # group only contains starting schedule item result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)] else: - # recurse with a bigger iname count + # recurse with a bigger minimum iname count result = build_insn_group(sched_indices_and_cond_inames[0:idx], new_codegen_state, len(current_iname_set)+1) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 1d1bc55d517e12e13b70909894d5c91adcf6af90..3fd65c4351996678e4f60a0cda0c237392b93c79 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -43,7 +43,7 @@ class ILPInstance(Record): def generate_ilp_instances(kernel, insn, codegen_state): impl_domain = codegen_state.implemented_domain - from loopy.kernel import TAG_ILP + from loopy.kernel import IlpTag result = [ILPInstance(impl_domain, {}, frozenset())] @@ -52,7 +52,7 @@ def generate_ilp_instances(kernel, insn, codegen_state): for iname in insn.all_inames(): tag = kernel.iname_to_tag.get(iname) - if not isinstance(tag, TAG_ILP): + if not isinstance(tag, IlpTag): continue from warnings import warn diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 20405c0ff97c738b87136bc454c3b2b7bbda5e49..14d6c586263c685f1f23c4e15f25dd5d760f37d3 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -32,7 +32,7 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition def get_slab_decomposition(kernel, iname, sched_index, codegen_state): - from loopy.isl import block_shift_constraint, negate_constraint + from loopy.isl_helpers import block_shift_constraint, negate_constraint ccm = codegen_state.c_code_mapper space = kernel.space @@ -82,7 +82,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): # {{{ unrolled/ILP loops def generate_unroll_loop(kernel, sched_index, codegen_state): - from loopy.isl import block_shift_constraint + from loopy.isl_helpers import block_shift_constraint from cgen import (POD, Line) @@ -95,7 +95,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): codegen_state.implemented_domain) bounds = kernel.get_iname_bounds(iname) - from loopy.isl import static_max_of_pw_aff + from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr length = int(pw_aff_to_expr(static_max_of_pw_aff(bounds.length))) @@ -114,8 +114,8 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): block_shift_constraint( lower_cns, iname, -i, as_equality=True))) - from loopy.kernel import TAG_UNROLL - if isinstance(tag, TAG_UNROLL): + from loopy.kernel import UnrollTag + if isinstance(tag, UnrollTag): result = [POD(np.int32, iname), Line()] for i in range(length): @@ -134,7 +134,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): # {{{ parallel loop def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None): - from loopy.kernel import UniqueTag, HardwareParallelTag, TAG_LOCAL_IDX, TAG_GROUP_IDX + from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag if hw_inames_left is None: hw_inames_left = [iname @@ -160,9 +160,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= # {{{ 'implement' hardware axis boundaries - if isinstance(tag, TAG_LOCAL_IDX): + if isinstance(tag, LocalIndexTag): hw_axis_size = local_size[tag.axis] - elif isinstance(tag, TAG_GROUP_IDX): + elif isinstance(tag, GroupIndexTag): hw_axis_size = global_size[tag.axis] else: raise RuntimeError("unknown hardware parallel tag") @@ -171,7 +171,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= bounds = kernel.get_iname_bounds(iname) - from loopy.isl import make_slab + from loopy.isl_helpers import make_slab slab = make_slab(kernel.space, iname, bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size) codegen_state = codegen_state.intersect(slab) diff --git a/loopy/compiled.py b/loopy/compiled.py index 85daa16121cebd2a3e827dd643852cc02c3f82f0..a263ea8f2a0326bca77c56c84ba30ff7272da7e6 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -48,7 +48,7 @@ class CompiledKernel: else: self.size_args = size_args - from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX + from loopy.kernel import GroupIndexTag, LocalIndexTag gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs() if not gsize_expr: gsize_expr = (1,) diff --git a/loopy/isl.py b/loopy/isl_helpers.py similarity index 100% rename from loopy/isl.py rename to loopy/isl_helpers.py diff --git a/loopy/kernel.py b/loopy/kernel.py index 0da1d56714406193f23693965fa5a906834d0a2c..dcf14172d7879bc97c1442ed14901b8c180c73c6 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -30,12 +30,15 @@ class SequentialTag(IndexTag): class ParallelTag(IndexTag): pass +class HardwareParallelTag(ParallelTag): + pass + class UniqueTag(IndexTag): @property def key(self): return type(self) -class HardwareParallelTag(ParallelTag, UniqueTag): +class AxisTag(UniqueTag): __slots__ = ["axis"] def __init__(self, axis): @@ -50,27 +53,31 @@ class HardwareParallelTag(ParallelTag, UniqueTag): return "%s.%d" % ( self.print_name, self.axis) -#class MultiTag(IndexTag): - -#class SplitTag(IndexTag): - - - -class TAG_GROUP_IDX(HardwareParallelTag): +class GroupIndexTag(HardwareParallelTag, AxisTag): print_name = "g" -class TAG_LOCAL_IDX(HardwareParallelTag): +class LocalIndexTagBase(HardwareParallelTag): + pass + +class LocalIndexTag(LocalIndexTagBase, AxisTag): print_name = "l" -class TAG_AUTO_LOCAL_IDX(ParallelTag): +class AutoLocalIndexTagBase(LocalIndexTagBase): + pass + +class AutoFitLocalIndexTag(AutoLocalIndexTagBase): def __str__(self): - return "l.auto" + return "l.fit" -class TAG_ILP(ParallelTag): +class AutoPickLocalIndexTag(AutoLocalIndexTagBase): + def __str__(self): + return "l.pick" + +class IlpTag(ParallelTag): def __str__(self): return "ilp" -class TAG_UNROLL(IndexTag): +class UnrollTag(IndexTag): def __str__(self): return "unr" @@ -85,13 +92,19 @@ def parse_tag(tag): raise ValueError("cannot parse tag: %s" % tag) if tag in ["unr"]: - return TAG_UNROLL() + return UnrollTag() elif tag == "ilp": - return TAG_ILP() + return IlpTag() elif tag.startswith("g."): - return TAG_GROUP_IDX(int(tag[2:])) + return GroupIndexTag(int(tag[2:])) elif tag.startswith("l."): - return TAG_LOCAL_IDX(int(tag[2:])) + axis = tag[2:] + if axis == "pick": + return AutoPickLocalIndexTag() + elif axis == "fit": + return AutoFitLocalIndexTag() + else: + return LocalIndexTag(int(axis)) else: raise ValueError("cannot parse tag: %s" % tag) @@ -593,7 +606,7 @@ class LoopKernel(Record): size=size) @memoize_method - def get_grid_sizes(self): + def get_grid_sizes(self, ignore_auto=False): all_inames_by_insns = set() for insn in self.instructions: all_inames_by_insns |= insn.all_inames() @@ -606,18 +619,18 @@ class LoopKernel(Record): local_sizes = {} from loopy.kernel import ( - TAG_GROUP_IDX, TAG_LOCAL_IDX, - TAG_AUTO_LOCAL_IDX) + GroupIndexTag, LocalIndexTag, + AutoLocalIndexTagBase) for iname in self.all_inames(): tag = self.iname_to_tag.get(iname) - if isinstance(tag, TAG_GROUP_IDX): + if isinstance(tag, GroupIndexTag): tgt_dict = global_sizes - elif isinstance(tag, TAG_LOCAL_IDX): + elif isinstance(tag, LocalIndexTag): tgt_dict = local_sizes - elif isinstance(tag, TAG_AUTO_LOCAL_IDX): - raise RuntimeError("cannot find grid sizes if AUTO_LOCAL_IDX tags are " + elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto: + raise RuntimeError("cannot find grid sizes if automatic local index tags are " "present") else: tgt_dict = None @@ -630,11 +643,11 @@ class LoopKernel(Record): if tag.axis in tgt_dict: size = tgt_dict[tag.axis].max(size) - from loopy.isl import static_max_of_pw_aff + from loopy.isl_helpers import static_max_of_pw_aff try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, TAG_LOCAL_IDX)) + constants_only=isinstance(tag, LocalIndexTag)) except ValueError: pass diff --git a/loopy/schedule.py b/loopy/schedule.py index 5cbf88471067c6cffe4c58b9123e9e6549dda1ea..e0c36afb50d10710fa1ff81a5d5921b789f90003 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -100,13 +100,13 @@ def realize_reduction(kernel, inames=None, reduction_tag=None): -def check_non_use_of_hw_axes(kernel): +def check_for_unused_hw_axes(kernel): group_size, local_size = kernel.get_grid_sizes_as_exprs() group_axes = set(range(len(group_size))) local_axes = set(range(len(local_size))) - from loopy.kernel import TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX, TAG_GROUP_IDX + from loopy.kernel import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag for insn in kernel.instructions: group_axes_used = set() local_axes_used = set() @@ -114,11 +114,11 @@ def check_non_use_of_hw_axes(kernel): for iname in insn.all_inames(): tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, TAG_LOCAL_IDX): + if isinstance(tag, LocalIndexTag): local_axes_used.add(tag.axis) - elif isinstance(tag, TAG_GROUP_IDX): + elif isinstance(tag, GroupIndexTag): group_axes_used.add(tag.axis) - elif isinstance(tag, TAG_AUTO_LOCAL_IDX): + elif isinstance(tag, AutoLocalIndexTagBase): raise RuntimeError("auto local tag encountered") if group_axes != group_axes_used: @@ -131,13 +131,13 @@ def check_non_use_of_hw_axes(kernel): def check_double_use_of_hw_axes(kernel): - from loopy.kernel import HardwareParallelTag + from loopy.kernel import UniqueTag for insn in kernel.instructions: insn_tag_keys = set() for iname in insn.all_inames(): tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag): + if isinstance(tag, UniqueTag): key = tag.key if key in insn_tag_keys: raise RuntimeError("instruction '%s' has two " @@ -398,7 +398,7 @@ def find_inadmissible_tag_keys(kernel, iname, iname_to_tag=None): def assign_automatic_axes(kernel): from loopy.kernel import ( - TAG_AUTO_LOCAL_IDX, TAG_LOCAL_IDX) + TAG_AUTO_LOCAL_IDX, LocalIndexTag) new_iname_to_tag = kernel.iname_to_tag @@ -418,14 +418,14 @@ def assign_automatic_axes(kernel): for iname in insn.all_inames(): tag = new_iname_to_tag.get(iname) - if isinstance(tag, TAG_LOCAL_IDX): + if isinstance(tag, LocalIndexTag): local_assigned_axes.add(tag.axis) if 0 not in local_assigned_axes: axis0_iname = guess_good_iname_for_axis_0(kernel, insn) axis0_iname_tag = new_iname_to_tag.get(axis0_iname) - ax0_tag = TAG_LOCAL_IDX(0) + ax0_tag = LocalIndexTag(0) if (isinstance(axis0_iname_tag, TAG_AUTO_LOCAL_IDX) and ax0_tag.key not in find_inadmissible_tag_keys( kernel, axis0_iname, new_iname_to_tag)): @@ -442,7 +442,7 @@ def assign_automatic_axes(kernel): while next_axis in local_assigned_axes: next_axis += 1 - new_iname_to_tag[iname] = TAG_LOCAL_IDX(next_axis) + new_iname_to_tag[iname] = LocalIndexTag(next_axis) local_assigned_axes.add(next_axis) return kernel.copy(iname_to_tag=new_iname_to_tag) @@ -716,7 +716,7 @@ def generate_loop_schedules(kernel): kernel = add_automatic_dependencies(kernel) kernel = assign_automatic_axes(kernel) - check_non_use_of_hw_axes(kernel) + check_for_unused_hw_axes(kernel) for gen_sched in generate_loop_schedules_internal(kernel): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) diff --git a/test/test_matmul.py b/test/test_matmul.py index 71b1d409e1f2d95c225544be1cf92c30f2a5947a..f861153c80e3aece10e8300a78b9591187fbf397 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -215,9 +215,9 @@ def test_plain_matrix_mul_new_ui(ctx_factory): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1", no_slabs=True) - knl = lp.split_dimension(knl, "j", 16, - outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 16) + knl = lp.split_dimension(knl, "j", 8, + outer_tag="g.1", inner_tag="l.0", no_slabs=True) + knl = lp.split_dimension(knl, "k", 32, no_slabs=True) knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"])