Commit 6f140ea6 authored by Andreas Klöckner

Iname dependency cleanups.

- Flag idempotent instructions.

- Exploit idempotent insns in scheduling, allowing them to be executed
  inside "too many" loops.

- Be more exact in what inames to duplicate in CSE pre-computes.
parent 1b8354b2
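
A side note on the idempotence flag introduced by this commit: an idempotent instruction computes the same values no matter how often it is executed, so the scheduler may nest it inside loops it does not actually depend on. A minimal standalone sketch (plain NumPy; names are illustrative, not part of loopy):

    import numpy as np

    # An idempotent instruction (a pure fetch/assignment) may be re-executed
    # inside an extra loop without changing the result.
    n = 4
    a = np.arange(n, dtype=np.float32)

    # Scheduled in exactly the right loop: executed once per i.
    fetch_once = np.empty(n, dtype=np.float32)
    for i in range(n):
        fetch_once[i] = a[i]

    # Scheduled inside one loop "too many": executed once per (j, i).
    fetch_redundant = np.empty(n, dtype=np.float32)
    for j in range(n):
        for i in range(n):
            fetch_redundant[i] = a[i]  # repeated writes of the same value are harmless

    assert (fetch_once == fetch_redundant).all()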
@@ -56,6 +56,8 @@ Things to consider
- Parallel dimension splitting/merging via tags
-> unnecessary?
- All user-supplied commands are assumed to be idempotent.
TODO
^^^^
@@ -78,23 +80,20 @@ TODO
- Slab decomposition for ILP
- Some things involving CSEs might be impossible to schedule
a[i,j] = cse(b[i]) * cse(c[j])
- Flag, exploit idempotence
- How should we implement the dim shuffling for odd-size prefetches?
- Better for loop bound generation
-> Try a triangular loop
- AUTO_PICK or AUTO_FIT
- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT
Dealt with
^^^^^^^^^^
- Flag, exploit idempotence
- Some things involving CSEs might be impossible to schedule
a[i,j] = cse(b[i]) * cse(c[j])
- Be smarter about automatic local axis choice
-> What if we run out of axes?
- Implement condition hoisting
(needed, e.g., by slab decomposition)
......
@@ -245,7 +245,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         if iname in duplicate_inames:
             tag = dup_iname_to_tag[iname]
         else:
-            tag = kernel.iname_to_tag[iname]
+            tag = kernel.iname_to_tag.get(iname)
 
         if isinstance(tag, LocalIndexTagBase):
             kind = "l"
@@ -273,9 +273,22 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                         "that the CSE does not depend on "
                         "does not make sense")
 
-            force_dependency = True
-            if kind == "l" and target_var_is_local:
-                force_dependency = False
+            # Which iname dependencies are carried over from CSE host
+            # to the CSE compute instruction?
+            if not target_var_is_local:
+                # If we're writing to a private variable, then each
+                # hardware-parallel iname must execute its own copy of
+                # the CSE compute instruction. After all, each work item
+                # has its own set of private variables.
+                force_dependency = kind in "gl"
+            else:
+                # If we're writing to a local variable, then all other local
+                # dimensions see our updates, and thus they do *not* need to
+                # execute their own copy of this instruction.
+                force_dependency = kind == "g"
 
             if force_dependency:
                 forced_iname_deps.append(iname)
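
The carry-over rule implemented above boils down to a small truth table over the iname kind and the target variable's storage. A standalone sketch (the function name is made up, not the loopy API):

    def carries_dependency(kind, target_var_is_local):
        # kind: "g" for a group (hardware) iname, "l" for a local one.
        if not target_var_is_local:
            # Private target: every hardware-parallel work item needs its
            # own copy of the compute instruction.
            return kind in "gl"
        else:
            # Local target: other local work items already see the update.
            return kind == "g"

    # Group inames always force a dependency; local inames only do so when
    # the CSE result lives in a private (per-work-item) variable.
    assert carries_dependency("g", target_var_is_local=False)
    assert carries_dependency("l", target_var_is_local=False)
    assert carries_dependency("g", target_var_is_local=True)
    assert not carries_dependency("l", target_var_is_local=True)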
@@ -308,7 +321,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 id=kernel.make_unique_instruction_id(based_on=cse_tag),
                 assignee=assignee,
                 expression=new_inner_expr,
-                forced_iname_deps=forced_iname_deps)
+                forced_iname_deps=forced_iname_deps,
+                idempotent=True)
 
         cse_result_insns.append(new_insn)
......
@@ -35,7 +35,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
     sched_item = kernel.schedule[sched_index]
 
     if isinstance(sched_item, EnterLoop):
-        tag = kernel.iname_to_tag[sched_item.iname]
+        tag = kernel.iname_to_tag.get(sched_item.iname)
 
         from loopy.codegen.loop import (
                 generate_unroll_loop,
......
@@ -220,16 +220,19 @@ class Instruction(Record):
     :ivar insn_deps: a list of ids of :class:`Instruction` instances that
         *must* be executed before this one. Note that loop scheduling augments this
         by adding dependencies on any writes to temporaries read by this instruction.
+    :ivar idempotent: Whether the instruction may be executed repeatedly (while obeying
+        dependencies) without changing the meaning of the program.
     """
 
     def __init__(self,
-            id, assignee, expression,
+            id, assignee, expression, idempotent,
             forced_iname_deps=[], insn_deps=[]):
 
+        assert isinstance(idempotent, bool)
+
         Record.__init__(self,
                 id=id, assignee=assignee, expression=expression,
                 forced_iname_deps=forced_iname_deps,
-                insn_deps=insn_deps,
-                )
+                insn_deps=insn_deps, idempotent=idempotent)
 
     @memoize_method
     def all_inames(self):
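
For context, the constructor now demands the flag as an explicit boolean rather than defaulting it. A simplified stand-in (not the real Record-based class) showing the intent:

    from dataclasses import dataclass, field

    @dataclass
    class MiniInstruction:
        # Simplified, illustrative stand-in for loopy's Instruction record.
        id: str
        assignee: str
        expression: str
        idempotent: bool                 # must be stated explicitly at every call site
        insn_deps: list = field(default_factory=list)

    insn = MiniInstruction(id="fetch_a", assignee="a_fetch", expression="a[i]",
            idempotent=True)
    assert isinstance(insn.idempotent, bool)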
@@ -447,7 +450,8 @@ class LoopKernel(Record):
                         id=self.make_unique_instruction_id(insns, based_on=label),
                         insn_deps=insn_deps,
                         forced_iname_deps=forced_iname_deps,
-                        assignee=lhs, expression=rhs)
+                        assignee=lhs, expression=rhs,
+                        idempotent=True)
 
         if isinstance(domain, str):
             ctx = isl.Context()
......
@@ -58,7 +58,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
                 forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
-                expression=expr.operation.neutral_element)
+                expression=expr.operation.neutral_element,
+                idempotent=True)
 
         new_insns.append(init_insn)
@@ -68,7 +69,8 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
                 assignee=target_var,
                 expression=expr.operation(target_var, sub_expr),
                 insn_deps=[init_insn.id],
-                forced_iname_deps=list(insn.all_inames()))
+                forced_iname_deps=list(insn.all_inames()),
+                idempotent=False)
 
         new_insns.append(reduction_insn)
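
The True/False split in the two reduction instructions above: re-running the neutral-element initializer is harmless, re-running the accumulation update is not. A small standalone illustration:

    x = [1.0, 2.0, 3.0]

    acc = 0.0             # init_insn: assigning the neutral element ...
    acc = 0.0             # ... twice changes nothing (idempotent=True)

    for xi in x:
        acc = acc + xi    # reduction_insn: executing the update once ...
    assert acc == 6.0

    for xi in x:
        acc = acc + xi    # ... and then again doubles the contribution (idempotent=False)
    assert acc == 12.0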
@@ -208,9 +210,11 @@ def check_for_unused_hw_axes(kernel):
             raise RuntimeError("auto local tag encountered")
 
         if group_axes != group_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw group axes")
+            raise RuntimeError("instruction '%s' does not use all hw group axes"
+                    % insn.id)
 
         if local_axes != local_axes_used:
-            raise RuntimeError("instruction '%s' does not use all hw local axes")
+            raise RuntimeError("instruction '%s' does not use all hw local axes"
+                    % insn.id)
@@ -608,8 +612,25 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
     for insn_id in unscheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
-        if (active_inames - parallel_inames
-                == insn.all_inames() - parallel_inames
+        if insn.idempotent:
+            # If insn is idempotent, it may be placed inside a more deeply
+            # nested loop without harm.
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    <=
+                    active_inames - parallel_inames)
+        else:
+            # If insn is not idempotent, we must insist that it is placed inside
+            # the exactly correct set of loops.
+            iname_deps_satisfied = (
+                    insn.all_inames() - parallel_inames
+                    ==
+                    active_inames - parallel_inames)
+
+        if (iname_deps_satisfied
                 and set(insn.insn_deps) <= scheduled_insn_ids):
             scheduled_insn_ids.add(insn.id)
             schedule = schedule + [RunInstruction(insn_id=insn.id)]
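
The scheduling condition above relaxes equality to a subset test for idempotent instructions. A standalone sketch of just that test (names are illustrative):

    def iname_deps_satisfied(insn_inames, active_inames, parallel_inames, idempotent):
        needed = insn_inames - parallel_inames
        active = active_inames - parallel_inames
        if idempotent:
            # Extra enclosing loops are acceptable.
            return needed <= active
        else:
            # The loop nest must match exactly.
            return needed == active

    # An instruction over {i}, considered while both the i and j loops are open:
    assert iname_deps_satisfied({"i"}, {"i", "j"}, set(), idempotent=True)
    assert not iname_deps_satisfied({"i"}, {"i", "j"}, set(), idempotent=False)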
@@ -812,12 +833,19 @@ def generate_loop_schedules(kernel):
     check_for_double_use_of_hw_axes(kernel)
     check_for_unused_hw_axes(kernel)
 
+    schedule_count = 0
+
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
         assert not owed_barriers
 
         yield kernel.copy(schedule=gen_sched)
 
+        schedule_count += 1
+
+    if not schedule_count:
+        raise RuntimeError("no valid schedules found")
+
     # }}}
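
The schedule_count guard turns a silently empty generator into a hard error. A sketch of the same pattern in isolation (toy generator, not the loopy one):

    def generate_items(candidates):
        count = 0
        for item in candidates:
            yield item
            count += 1
        if not count:
            raise RuntimeError("no valid schedules found")

    assert list(generate_items(["sched_a"])) == ["sched_a"]
    try:
        list(generate_items([]))
    except RuntimeError:
        pass  # expected: nothing was yielded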
......
@@ -244,6 +244,60 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
+def test_rank_one(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+    queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    n = int(get_suitable_size(ctx)**(3/2))
+    print n
+
+    knl = lp.LoopKernel(ctx.devices[0],
+            "[n] -> {[i,j]: 0<=i,j<n}",
+            [
+                "label: c[i, j] = cse(a[i], a)*cse(b[j], b)"
+                ],
+            [
+                lp.ArrayArg("a", dtype, shape=(n,), order=order),
+                lp.ArrayArg("b", dtype, shape=(n,), order=order),
+                lp.ArrayArg("c", dtype, shape=(n, n), order=order),
+                lp.ScalarArg("n", np.int32, approximately=n),
+                ],
+            name="rank_one", assumptions="n >= 16")
+
+    #knl = lp.split_dimension(knl, "i", 16,
+            #outer_tag="g.0", inner_tag="l.1", no_slabs=True)
+    #knl = lp.split_dimension(knl, "j", 8,
+            #outer_tag="g.1", inner_tag="l.0", no_slabs=True)
+    #knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
+
+    knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"])
+    knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"])
+
+    kernel_gen = lp.generate_loop_schedules(knl)
+    kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
+
+    a = cl_random.rand(queue, n, dtype=dtype)
+    b = cl_random.rand(queue, n, dtype=dtype)
+    refsol = a.get()[:, np.newaxis] * b.get()
+    c = cl_array.empty(queue, refsol.shape, refsol.dtype)
+
+    def launcher(kernel, gsize, lsize, check):
+        evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
+                g_times_l=True)
+
+        if check:
+            check_error(refsol, c.get())
+
+        return evt
+
+    lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
+
+
 def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
......
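
A side note on the reference solution in test_rank_one above: the broadcasted product a[:, np.newaxis] * b is exactly the rank-one (outer) product of the two vectors, e.g.:

    import numpy as np

    a = np.random.rand(4).astype(np.float32)
    b = np.random.rand(5).astype(np.float32)
    refsol = a[:, np.newaxis] * b
    assert np.allclose(refsol, np.outer(a, b))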