From 6f140ea6d0567b601918662ef744dbd683fd1672 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Mon, 17 Oct 2011 01:27:12 -0400 Subject: [PATCH] Iname dependency cleanups. - Flag idempotent instructions. - Exploit idempotent insns in scheduling, allowing them to be executed inside "too many" loops. - Be more exact in what inames to duplicate in CSE pre-computes. --- MEMO | 21 ++++++++------- loopy/__init__.py | 24 +++++++++++++---- loopy/codegen/dispatch.py | 2 +- loopy/kernel.py | 12 ++++++--- loopy/schedule.py | 40 ++++++++++++++++++++++++----- test/test_matmul.py | 54 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 27 deletions(-) diff --git a/MEMO b/MEMO index a5cbf93f0..facaab5a8 100644 --- a/MEMO +++ b/MEMO @@ -56,6 +56,8 @@ Things to consider - Parallel dimension splitting/merging via tags -> unnecessary? +- All user-supplied commands are assumed to be idempotent. + TODO ^^^^ @@ -78,23 +80,20 @@ TODO - Slab decomposition for ILP -- Some things involving CSEs might be impossible to schedule - a[i,j] = cse(b[i]) * cse(c[j]) - -- Flag, exploit idempotence - -- How should we implement the dim shuffling for odd-size prefetches? - - Better for loop bound generation -> Try a triangular loop -- AUTO_PICK or AUTO_FIT - -- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT - Dealt with ^^^^^^^^^^ +- Flag, exploit idempotence + +- Some things involving CSEs might be impossible to schedule + a[i,j] = cse(b[i]) * cse(c[j]) + +- Be smarter about automatic local axis choice + -> What if we run out of axes? + - Implement condition hoisting (needed, e.g., by slab decomposition) diff --git a/loopy/__init__.py b/loopy/__init__.py index d3adafe88..784bc99cf 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -245,7 +245,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non if iname in duplicate_inames: tag = dup_iname_to_tag[iname] else: - tag = kernel.iname_to_tag[iname] + tag = kernel.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTagBase): kind = "l" @@ -273,9 +273,22 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non "that the CSE does not depend on " "does not make sense") - force_dependency = True - if kind == "l" and target_var_is_local: - force_dependency = False + # Which iname dependencies are carried over from CSE host + # to the CSE compute instruction? + + if not target_var_is_local: + # If we're writing to a private variable, then each + # hardware-parallel iname must execute its own copy of + # the CSE compute instruction. After all, each work item + # has its own set of private variables. + + force_dependency = kind in "gl" + else: + # If we're writing to a local variable, then all other local + # dimensions see our updates, and thus they do *not* need to + # execute their own copy of this instruction. + + force_dependency = kind == "g" if force_dependency: forced_iname_deps.append(iname) @@ -308,7 +321,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non id=kernel.make_unique_instruction_id(based_on=cse_tag), assignee=assignee, expression=new_inner_expr, - forced_iname_deps=forced_iname_deps) + forced_iname_deps=forced_iname_deps, + idempotent=True) cse_result_insns.append(new_insn) diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index b79ce784c..ae6144cee 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -35,7 +35,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): sched_item = kernel.schedule[sched_index] if isinstance(sched_item, EnterLoop): - tag = kernel.iname_to_tag[sched_item.iname] + tag = kernel.iname_to_tag.get(sched_item.iname) from loopy.codegen.loop import ( generate_unroll_loop, diff --git a/loopy/kernel.py b/loopy/kernel.py index 699b159ff..0d9ea7cf0 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -220,16 +220,19 @@ class Instruction(Record): :ivar insn_deps: a list of ids of :class:`Instruction` instances that *must* be executed before this one. Note that loop scheduling augments this by adding dependencies on any writes to temporaries read by this instruction. + :ivar idempotent: Whether the instruction may be executed repeatedly (while obeying + dependencies) without changing the meaning of the program. """ def __init__(self, - id, assignee, expression, + id, assignee, expression, idempotent, forced_iname_deps=[], insn_deps=[]): + assert isinstance(idempotent, bool) + Record.__init__(self, id=id, assignee=assignee, expression=expression, forced_iname_deps=forced_iname_deps, - insn_deps=insn_deps, - ) + insn_deps=insn_deps, idempotent=idempotent) @memoize_method def all_inames(self): @@ -447,7 +450,8 @@ class LoopKernel(Record): id=self.make_unique_instruction_id(insns, based_on=label), insn_deps=insn_deps, forced_iname_deps=forced_iname_deps, - assignee=lhs, expression=rhs) + assignee=lhs, expression=rhs, + idempotent=True) if isinstance(domain, str): ctx = isl.Context() diff --git a/loopy/schedule.py b/loopy/schedule.py index 7afd6bb87..3d7a329ce 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -58,7 +58,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None): extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, forced_iname_deps=list(insn.all_inames() - set(expr.inames)), - expression=expr.operation.neutral_element) + expression=expr.operation.neutral_element, + idempotent=True) new_insns.append(init_insn) @@ -68,7 +69,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None): assignee=target_var, expression=expr.operation(target_var, sub_expr), insn_deps=[init_insn.id], - forced_iname_deps=list(insn.all_inames())) + forced_iname_deps=list(insn.all_inames()), + idempotent=False) new_insns.append(reduction_insn) @@ -208,9 +210,11 @@ def check_for_unused_hw_axes(kernel): raise RuntimeError("auto local tag encountered") if group_axes != group_axes_used: - raise RuntimeError("instruction '%s' does not use all hw group axes") + raise RuntimeError("instruction '%s' does not use all hw group axes" + % insn.id) if local_axes != local_axes_used: - raise RuntimeError("instruction '%s' does not use all hw local axes") + raise RuntimeError("instruction '%s' does not use all hw local axes" + % insn.id) @@ -608,8 +612,25 @@ def generate_loop_schedules_internal(kernel, schedule=[]): for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] - if (active_inames - parallel_inames - == insn.all_inames() - parallel_inames + + if insn.idempotent: + # If insn is idempotent, it may be placed inside a more deeply + # nested loop without harm. + + iname_deps_satisfied = ( + insn.all_inames() - parallel_inames + <= + active_inames - parallel_inames) + else: + # If insn is not idempotent, we must insist that it is placed inside + # the exactly correct set of loops. + + iname_deps_satisfied = ( + insn.all_inames() - parallel_inames + == + active_inames - parallel_inames) + + if (iname_deps_satisfied and set(insn.insn_deps) <= scheduled_insn_ids): scheduled_insn_ids.add(insn.id) schedule = schedule + [RunInstruction(insn_id=insn.id)] @@ -812,12 +833,19 @@ def generate_loop_schedules(kernel): check_for_double_use_of_hw_axes(kernel) check_for_unused_hw_axes(kernel) + schedule_count = 0 + for gen_sched in generate_loop_schedules_internal(kernel): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) assert not owed_barriers yield kernel.copy(schedule=gen_sched) + schedule_count += 1 + + if not schedule_count: + raise RuntimeError("no valid schedules found") + # }}} diff --git a/test/test_matmul.py b/test/test_matmul.py index f861153c8..c6797c6bc 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -244,6 +244,60 @@ def test_plain_matrix_mul_new_ui(ctx_factory): +def test_rank_one(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = int(get_suitable_size(ctx)**(3/2)) + print n + + knl = lp.LoopKernel(ctx.devices[0], + "[n] -> {[i,j]: 0<=i,j<n}", + [ + "label: c[i, j] = cse(a[i], a)*cse(b[j], b)" + ], + [ + lp.ArrayArg("a", dtype, shape=(n,), order=order), + lp.ArrayArg("b", dtype, shape=(n,), order=order), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + lp.ScalarArg("n", np.int32, approximately=n), + ], + name="rank_one", assumptions="n >= 16") + + #knl = lp.split_dimension(knl, "i", 16, + #outer_tag="g.0", inner_tag="l.1", no_slabs=True) + #knl = lp.split_dimension(knl, "j", 8, + #outer_tag="g.1", inner_tag="l.0", no_slabs=True) + #knl = lp.split_dimension(knl, "k", 32, no_slabs=True) + + knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"]) + knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"]) + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6) + + a = cl_random.rand(queue, n, dtype=dtype) + b = cl_random.rand(queue, n, dtype=dtype) + refsol = a.get()[:, np.newaxis] * b.get() + c = cl_array.empty(queue, refsol.shape, refsol.dtype) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, n**2) + + + + def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() -- GitLab