Commit 2513ee98 authored by Andreas Klöckner

Automatically find idempotent instructions. Minor tweaks.

parent 6f140ea6
......@@ -56,7 +56,8 @@ Things to consider
- Parallel dimension splitting/merging via tags
-> unnecessary?
- All user-supplied commands are assumed to be idempotent.
- Not using all hw loop dimensions causes an error, as
is the case for variant 3 in the rank_one test.
TODO
^^^^
......@@ -83,6 +84,9 @@ TODO
- Better for loop bound generation
-> Try a triangular loop
- Nested slab decomposition (in conjunction with conditional hoisting) could
generate nested conditional code.
Dealt with
^^^^^^^^^^
......
......@@ -38,14 +38,11 @@ from loopy.compiled import CompiledKernel, drive_timing_run
def split_dimension(kernel, iname, inner_length, padded_length=None,
outer_iname=None, inner_iname=None,
outer_tag=None, inner_tag=None,
outer_slab_increments=(0, -1), no_slabs=None):
slabs=(0, 0)):
if iname not in kernel.all_inames():
raise ValueError("cannot split loop for unknown variable '%s'" % iname)
if no_slabs:
outer_slab_increments = (0, 0)
if padded_length is not None:
inner_tag = inner_tag.copy(forced_length=padded_length)
......@@ -115,7 +112,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
# }}}
iname_slab_increments = kernel.iname_slab_increments.copy()
iname_slab_increments[outer_iname] = outer_slab_increments
iname_slab_increments[outer_iname] = slabs
result = (kernel
.copy(domain=new_domain,
iname_slab_increments=iname_slab_increments,
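The two hunks above fold the old outer_slab_increments/no_slabs pair into a single slabs keyword that defaults to (0, 0), i.e. no slabs unless asked for. A hedged sketch of how call sites migrate, assuming only the lp.split_dimension signature visible in this diff and an existing LoopKernel knl:

import loopy as lp

# Before this commit: slabs were on by default and had to be switched off.
#   knl = lp.split_dimension(knl, "i", 16,
#           outer_tag="g.0", inner_tag="l.1", no_slabs=True)

# After this commit: the default slabs=(0, 0) means "no slabs"; pass the
# lower/upper slab increments for the outer iname explicitly when wanted.
knl = lp.split_dimension(knl, "i", 16,
        outer_tag="g.0", inner_tag="l.1")          # same as slabs=(0, 0)
knl = lp.split_dimension(knl, "j", 256,
        outer_tag="g.1", slabs=(0, -1))            # trailing cleanup slab only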
......@@ -321,8 +318,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
id=kernel.make_unique_instruction_id(based_on=cse_tag),
assignee=assignee,
expression=new_inner_expr,
forced_iname_deps=forced_iname_deps,
idempotent=True)
forced_iname_deps=forced_iname_deps)
cse_result_insns.append(new_insn)
......
......@@ -224,10 +224,8 @@ class Instruction(Record):
dependencies) without changing the meaning of the program.
"""
def __init__(self,
id, assignee, expression, idempotent,
forced_iname_deps=[], insn_deps=[]):
assert isinstance(idempotent, bool)
id, assignee, expression,
forced_iname_deps=[], insn_deps=[], idempotent=None):
Record.__init__(self,
id=id, assignee=assignee, expression=expression,
......@@ -258,6 +256,15 @@ class Instruction(Record):
result = "%s: %s <- %s\n [%s]" % (self.id,
self.assignee, self.expression, ", ".join(sorted(self.all_inames())))
if self.idempotent == True:
result += " (idempotent)"
elif self.idempotent == False:
result += " (not idempotent)"
elif self.idempotent is None:
result += " (idempotence unknown)"
else:
raise RuntimeError("unexpected value for Instruction.idempotent")
if self.insn_deps:
result += "\n : " + ", ".join(self.insn_deps)
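After this change Instruction.idempotent is a tri-state flag: True means the assignment may be executed more than once without changing the result, False means it may not (e.g. a reduction update), and None means the automatic pass further down has not classified it yet. A small self-contained sketch of the convention used by __str__ above (illustrative only, not loopy's actual class):

def describe_idempotence(idempotent):
    # Mirrors the annotation logic in Instruction.__str__ above.
    if idempotent == True:
        return "(idempotent)"
    elif idempotent == False:
        return "(not idempotent)"
    elif idempotent is None:
        return "(idempotence unknown)"
    else:
        raise RuntimeError("unexpected value for Instruction.idempotent")

# None is now the constructor default; the scheduler refuses to place an
# instruction until the automatic pass has replaced it with True or False.
assert describe_idempotence(None) == "(idempotence unknown)"
assert describe_idempotence(False) == "(not idempotent)"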
......@@ -450,8 +457,7 @@ class LoopKernel(Record):
id=self.make_unique_instruction_id(insns, based_on=label),
insn_deps=insn_deps,
forced_iname_deps=forced_iname_deps,
assignee=lhs, expression=rhs,
idempotent=True)
assignee=lhs, expression=rhs)
if isinstance(domain, str):
ctx = isl.Context()
......
......@@ -58,8 +58,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
extra_used_ids=set(ni.id for ni in new_insns)),
assignee=target_var,
forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
expression=expr.operation.neutral_element,
idempotent=True)
expression=expr.operation.neutral_element)
new_insns.append(init_insn)
......@@ -69,8 +68,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
assignee=target_var,
expression=expr.operation(target_var, sub_expr),
insn_deps=[init_insn.id],
forced_iname_deps=list(insn.all_inames()),
idempotent=False)
forced_iname_deps=list(insn.all_inames()))
new_insns.append(reduction_insn)
......@@ -210,10 +208,10 @@ def check_for_unused_hw_axes(kernel):
raise RuntimeError("auto local tag encountered")
if group_axes != group_axes_used:
raise RuntimeError("instruction '%s' does not use all hw group axes"
raise RuntimeError("instruction '%s' does not use all group hw axes"
% insn.id)
if local_axes != local_axes_used:
raise RuntimeError("instruction '%s' does not use all hw local axes"
raise RuntimeError("instruction '%s' does not use all local hw axes"
% insn.id)
......@@ -305,53 +303,65 @@ def adjust_local_temp_var_storage(kernel):
# }}}
# {{{ automatic dependencies
# {{{ automatic dependencies, find idempotent instructions
def find_writers(kernel):
def find_accessors(kernel, readers):
"""
:return: a dict that maps variable names to ids of insns that
write to that variable.
"""
writer_insn_ids = {}
result = {}
admissible_write_vars = (
admissible_vars = (
set(arg.name for arg in kernel.args)
| set(kernel.temporary_variables.iterkeys()))
for insn in kernel.instructions:
var_name = insn.get_assignee_var_name()
if var_name not in admissible_write_vars:
raise RuntimeError("writing to '%s' is not allowed" % var_name)
if readers:
from loopy.symbolic import DependencyMapper
var_names = DependencyMapper()(insn.expression) & admissible_vars
else:
var_name = insn.get_assignee_var_name()
writer_insn_ids.setdefault(var_name, set()).add(insn.id)
if var_name not in admissible_vars:
raise RuntimeError("writing to '%s' is not allowed" % var_name)
var_names = [var_name]
return writer_insn_ids
for var_name in var_names:
result.setdefault(var_name, set()).add(insn.id)
return result
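The helper formerly known as find_writers now builds either a writer map or a reader map, selected by the readers flag. A hedged sketch of the same idea on a simplified instruction model (plain tuples instead of loopy's Instruction/DependencyMapper machinery):

def find_accessors_sketch(instructions, readers):
    # instructions: iterable of (id, assignee, read_variable_names)
    # returns: dict mapping variable name -> set of instruction ids touching it
    result = {}
    for insn_id, assignee, read_vars in instructions:
        var_names = read_vars if readers else [assignee]
        for var_name in var_names:
            result.setdefault(var_name, set()).add(insn_id)
    return result

insns = [
    ("init",   "acc", set()),           # acc <- 0
    ("update", "acc", {"acc", "a"}),    # acc <- acc + a[i]
    ("store",  "out", {"acc"}),         # out[i] <- acc
    ]
writer_map = find_accessors_sketch(insns, readers=False)
# {'acc': {'init', 'update'}, 'out': {'store'}}
reader_map = find_accessors_sketch(insns, readers=True)
# {'acc': {'update', 'store'}, 'a': {'update'}}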
def add_automatic_dependencies(kernel):
writer_map = find_writers(kernel)
def add_idempotence_and_automatic_dependencies(kernel):
writer_map = find_accessors(kernel, readers=False)
arg_names = set(arg.name for arg in kernel.args)
var_names = arg_names | set(kernel.temporary_variables.iterkeys())
from loopy.symbolic import DependencyMapper
dep_map = DependencyMapper(composite_leaves=False)
new_insns = []
dm = DependencyMapper(composite_leaves=False)
dep_map = {}
for insn in kernel.instructions:
read_vars = (
set(var.name for var in dep_map(insn.expression))
dep_map[insn.id] = (
set(var.name for var in dm(insn.expression))
& var_names)
new_insns = []
for insn in kernel.instructions:
auto_deps = []
for var in read_vars:
# {{{ add automatic dependencies
all_my_var_writers = set()
for var in dep_map[insn.id]:
var_writers = writer_map.get(var, set())
all_my_var_writers |= var_writers
if not var_writers and var not in var_names:
if not var_writers and var not in arg_names:
from warnings import warn
warn("'%s' is read, but never written." % var)
......@@ -365,9 +375,26 @@ def add_automatic_dependencies(kernel):
if len(var_writers) == 1:
auto_deps.extend(var_writers)
# }}}
# {{{ find dependency loops, flag idempotence
while True:
last_all_my_var_writers = all_my_var_writers
for writer_insn_id in last_all_my_var_writers:
for var in dep_map[writer_insn_id]:
all_my_var_writers = all_my_var_writers | writer_map.get(var, set())
if last_all_my_var_writers == all_my_var_writers:
break
# }}}
new_insns.append(
insn.copy(
insn_deps=insn.insn_deps + auto_deps))
insn_deps=insn.insn_deps + auto_deps,
idempotent=insn.id not in all_my_var_writers))
return kernel.copy(instructions=new_insns)
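The new pass decides idempotence by chasing writers to a fixed point: starting from the variables an instruction reads, it repeatedly adds the writers of everything those writer instructions read until the set stops growing, and flags the instruction idempotent exactly when it is absent from its own transitive writer set (i.e. it is not on a read-after-write cycle through itself, as a reduction update is). A self-contained sketch of that computation, reusing the simplified toy model from the previous sketch rather than loopy's real data structures:

def transitive_writers(insn_id, read_vars_of, writer_map):
    # Seed with the direct writers of everything insn_id reads ...
    writers = set()
    for var in read_vars_of[insn_id]:
        writers |= writer_map.get(var, set())
    # ... then follow read-after-write chains until nothing new is added.
    while True:
        prev = writers
        for writer_id in prev:
            for var in read_vars_of[writer_id]:
                writers = writers | writer_map.get(var, set())
        if writers == prev:
            break
    return writers

read_vars_of = {"init": set(), "update": {"acc", "a"}, "store": {"acc"}}
writer_map = {"acc": {"init", "update"}, "out": {"store"}}
idempotence = dict(
    (iid, iid not in transitive_writers(iid, read_vars_of, writer_map))
    for iid in read_vars_of)
# {'init': True, 'update': False, 'store': True} -- "update" reads and writes
# "acc", so it lands in its own transitive writer set and is flagged as
# not idempotent.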
......@@ -514,7 +541,7 @@ def assign_automatic_axes(kernel, only_axis_0=True):
from loopy import split_dimension
return assign_automatic_axes(
split_dimension(kernel, iname, inner_length=local_size[axis],
outer_tag=UnrollTag(), inner_tag=new_tag, no_slabs=True),
outer_tag=UnrollTag(), inner_tag=new_tag),
only_axis_0=only_axis_0)
new_iname_to_tag = kernel.iname_to_tag.copy()
......@@ -613,7 +640,7 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
for insn_id in unscheduled_insn_ids:
insn = kernel.id_to_insn[insn_id]
if insn.idempotent:
if insn.idempotent == True:
# If insn is idempotent, it may be placed inside a more deeply
# nested loop without harm.
......@@ -621,7 +648,8 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
insn.all_inames() - parallel_inames
<=
active_inames - parallel_inames)
else:
elif insn.idempotent == False:
# If insn is not idempotent, we must insist that it is placed inside
# the exactly correct set of loops.
......@@ -630,6 +658,10 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
==
active_inames - parallel_inames)
else:
raise RuntimeError("instruction '%s' has undetermined idempotence"
% insn.id)
if (iname_deps_satisfied
and set(insn.insn_deps) <= scheduled_insn_ids):
scheduled_insn_ids.add(insn.id)
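The scheduler now branches on the tri-state flag: an instruction marked idempotent only needs its sequential inames to be a subset of the currently active ones, a non-idempotent instruction needs exactly the active set, and None is rejected with an error. A hedged sketch of just that predicate, with plain sets standing in for loopy's iname bookkeeping:

def may_schedule_here(insn_inames, active_inames, parallel_inames, idempotent):
    want = insn_inames - parallel_inames
    have = active_inames - parallel_inames
    if idempotent == True:
        # Harmless to re-execute inside extra surrounding loops.
        return want <= have
    elif idempotent == False:
        # e.g. a reduction update: must sit in exactly the right loops.
        return want == have
    else:
        raise RuntimeError("undetermined idempotence")

may_schedule_here({"i"}, {"i", "j"}, set(), idempotent=True)    # True
may_schedule_here({"i"}, {"i", "j"}, set(), idempotent=False)   # False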
......@@ -782,7 +814,7 @@ def insert_barriers(kernel, schedule, level=0):
# {{{ issue dependency-based barriers for this instruction
if insn.id in owed_barriers:
if set(insn.insn_deps) & owed_barriers:
issue_barrier(is_pre_barrier=False)
# }}}
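The barrier condition above changes from "this instruction itself owes a barrier" to "one of the instructions it depends on still owes one", tying barriers to actual dependency edges. A hedged sketch of the new test, treating insn_deps and owed_barriers as plain sets of instruction ids:

def needs_dependency_barrier(insn_deps, owed_barriers):
    # Issue a barrier only if we depend on an instruction whose result has
    # not yet been made visible by a barrier.
    return bool(set(insn_deps) & owed_barriers)

owed_barriers = set(["write_shared_a"])
needs_dependency_barrier(["write_shared_a"], owed_barriers)   # True  -> issue_barrier(...)
needs_dependency_barrier(["unrelated"], owed_barriers)        # False -> no barrier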
......@@ -827,7 +859,7 @@ def generate_loop_schedules(kernel):
# }}}
kernel = assign_automatic_axes(kernel)
kernel = add_automatic_dependencies(kernel)
kernel = add_idempotence_and_automatic_dependencies(kernel)
kernel = adjust_local_temp_var_storage(kernel)
check_for_double_use_of_hw_axes(kernel)
......
from __future__ import division
import numpy as np
import numpy.linalg as la
import pyopencl as cl
......@@ -214,16 +216,16 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
name="matmul", assumptions="n >= 16")
knl = lp.split_dimension(knl, "i", 16,
outer_tag="g.0", inner_tag="l.1", no_slabs=True)
outer_tag="g.0", inner_tag="l.1")
knl = lp.split_dimension(knl, "j", 8,
outer_tag="g.1", inner_tag="l.0", no_slabs=True)
knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
outer_tag="g.1", inner_tag="l.0")
knl = lp.split_dimension(knl, "k", 32)
knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"])
knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"])
kernel_gen = lp.generate_loop_schedules(knl)
kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
......@@ -251,8 +253,7 @@ def test_rank_one(ctx_factory):
queue = cl.CommandQueue(ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
n = int(get_suitable_size(ctx)**(3/2))
print n
n = int(get_suitable_size(ctx)**(2.7/2))
knl = lp.LoopKernel(ctx.devices[0],
"[n] -> {[i,j]: 0<=i,j<n}",
......@@ -267,33 +268,71 @@ def test_rank_one(ctx_factory):
],
name="rank_one", assumptions="n >= 16")
#knl = lp.split_dimension(knl, "i", 16,
#outer_tag="g.0", inner_tag="l.1", no_slabs=True)
#knl = lp.split_dimension(knl, "j", 8,
#outer_tag="g.1", inner_tag="l.0", no_slabs=True)
#knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"])
knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"])
kernel_gen = lp.generate_loop_schedules(knl)
kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6)
a = cl_random.rand(queue, n, dtype=dtype)
b = cl_random.rand(queue, n, dtype=dtype)
refsol = a.get()[:, np.newaxis] * b.get()
c = cl_array.empty(queue, refsol.shape, refsol.dtype)
def launcher(kernel, gsize, lsize, check):
evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
g_times_l=True)
if check:
check_error(refsol, c.get())
return evt
lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
def variant_1(knl):
knl = lp.realize_cse(knl, "a", dtype)
knl = lp.realize_cse(knl, "b", dtype)
return knl
def variant_2(knl):
knl = lp.split_dimension(knl, "i", 16,
outer_tag="g.0", inner_tag="l.0")
knl = lp.split_dimension(knl, "j", 16,
outer_tag="g.1", inner_tag="l.1")
knl = lp.realize_cse(knl, "a", dtype)
knl = lp.realize_cse(knl, "b", dtype)
return knl
def variant_3(knl):
knl = lp.split_dimension(knl, "i", 16,
outer_tag="g.0", inner_tag="l.0")
knl = lp.split_dimension(knl, "j", 16,
outer_tag="g.1", inner_tag="l.1")
knl = lp.realize_cse(knl, "a", dtype, ["i_inner"])
knl = lp.realize_cse(knl, "b", dtype, ["j_inner"])
return knl
def variant_4(knl):
knl = lp.split_dimension(knl, "i", 256,
outer_tag="g.0", slabs=(0, -1))
knl = lp.split_dimension(knl, "j", 256,
outer_tag="g.1", slabs=(0, -1))
knl = lp.realize_cse(knl, "a", dtype, ["i_inner"])
knl = lp.realize_cse(knl, "b", dtype, ["j_inner"])
knl = lp.split_dimension(knl, "i_inner", 16,
inner_tag="l.0")
knl = lp.split_dimension(knl, "j_inner", 16,
inner_tag="l.1")
knl = lp.split_dimension(knl, "j_inner_0", 16,
outer_tag="l.1", inner_tag="l.0")
knl = lp.split_dimension(knl, "i_inner_0", 16,
outer_tag="l.1", inner_tag="l.0")
return knl
#for variant in [variant_1, variant_2, variant_3]:
for variant in [variant_4]:
kernel_gen = lp.generate_loop_schedules(variant(knl))
kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5)
a = cl_random.rand(queue, n, dtype=dtype)
b = cl_random.rand(queue, n, dtype=dtype)
refsol = a.get()[:, np.newaxis] * b.get()
c = cl_array.empty(queue, refsol.shape, refsol.dtype)
def launcher(kernel, gsize, lsize, check):
evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n,
g_times_l=True)
if check:
check_error(refsol, c.get())
return evt
lp.drive_timing_run(kernel_gen, queue, launcher, n**2)
......