diff --git a/MEMO b/MEMO index facaab5a845f999aadc7c4e2905ab776707cb6f8..ea2317ab2b8c5f8b4dc46dd1ed39c86d56c4c142 100644 --- a/MEMO +++ b/MEMO @@ -56,7 +56,8 @@ Things to consider - Parallel dimension splitting/merging via tags -> unnecessary? -- All user-supplied commands are assumed to be idempotent. +- Not using all hw loop dimensions causes an error, as + is the case for variant 3 in the rank_one test. TODO ^^^^ @@ -83,6 +84,9 @@ TODO - Better for loop bound generation -> Try a triangular loop +- Nested slab decomposition (in conjunction with conditional hoisting) could + generate nested conditional code. + Dealt with ^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index 784bc99cf1d900e177e6d612c2e5d1338d6d9c75..86c0986c026616bdef457e7a351876aa319d5507 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -38,14 +38,11 @@ from loopy.compiled import CompiledKernel, drive_timing_run def split_dimension(kernel, iname, inner_length, padded_length=None, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, - outer_slab_increments=(0, -1), no_slabs=None): + slabs=(0, 0)): if iname not in kernel.all_inames(): raise ValueError("cannot split loop for unknown variable '%s'" % iname) - if no_slabs: - outer_slab_increments = (0, 0) - if padded_length is not None: inner_tag = inner_tag.copy(forced_length=padded_length) @@ -115,7 +112,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None, # }}} iname_slab_increments = kernel.iname_slab_increments.copy() - iname_slab_increments[outer_iname] = outer_slab_increments + iname_slab_increments[outer_iname] = slabs result = (kernel .copy(domain=new_domain, iname_slab_increments=iname_slab_increments, @@ -321,8 +318,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non id=kernel.make_unique_instruction_id(based_on=cse_tag), assignee=assignee, expression=new_inner_expr, - forced_iname_deps=forced_iname_deps, - idempotent=True) + forced_iname_deps=forced_iname_deps) cse_result_insns.append(new_insn) diff --git a/loopy/kernel.py b/loopy/kernel.py index 0d9ea7cf05f35565697f9d10e780709ef7f70d26..d9db977807448348c4656342df79390e2f4e2f8e 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -224,10 +224,8 @@ class Instruction(Record): dependencies) without changing the meaning of the program. """ def __init__(self, - id, assignee, expression, idempotent, - forced_iname_deps=[], insn_deps=[]): - - assert isinstance(idempotent, bool) + id, assignee, expression, + forced_iname_deps=[], insn_deps=[], idempotent=None): Record.__init__(self, id=id, assignee=assignee, expression=expression, @@ -258,6 +256,15 @@ class Instruction(Record): result = "%s: %s <- %s\n [%s]" % (self.id, self.assignee, self.expression, ", ".join(sorted(self.all_inames()))) + if self.idempotent == True: + result += " (idempotent)" + elif self.idempotent == False: + result += " (not idempotent)" + elif self.idempotent is None: + result += " (idempotence unknown)" + else: + raise RuntimeError("unexpected value for Instruction.idempotent") + if self.insn_deps: result += "\n : " + ", ".join(self.insn_deps) @@ -450,8 +457,7 @@ class LoopKernel(Record): id=self.make_unique_instruction_id(insns, based_on=label), insn_deps=insn_deps, forced_iname_deps=forced_iname_deps, - assignee=lhs, expression=rhs, - idempotent=True) + assignee=lhs, expression=rhs) if isinstance(domain, str): ctx = isl.Context() diff --git a/loopy/schedule.py b/loopy/schedule.py index 3d7a329cede167ff0c2e8c4f9ace40aa8bc2aa38..cf146782bf59945a857673aef566cfee67918f40 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -58,8 +58,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None): extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, forced_iname_deps=list(insn.all_inames() - set(expr.inames)), - expression=expr.operation.neutral_element, - idempotent=True) + expression=expr.operation.neutral_element) new_insns.append(init_insn) @@ -69,8 +68,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None): assignee=target_var, expression=expr.operation(target_var, sub_expr), insn_deps=[init_insn.id], - forced_iname_deps=list(insn.all_inames()), - idempotent=False) + forced_iname_deps=list(insn.all_inames())) new_insns.append(reduction_insn) @@ -210,10 +208,10 @@ def check_for_unused_hw_axes(kernel): raise RuntimeError("auto local tag encountered") if group_axes != group_axes_used: - raise RuntimeError("instruction '%s' does not use all hw group axes" + raise RuntimeError("instruction '%s' does not use all group hw axes" % insn.id) if local_axes != local_axes_used: - raise RuntimeError("instruction '%s' does not use all hw local axes" + raise RuntimeError("instruction '%s' does not use all local hw axes" % insn.id) @@ -305,53 +303,65 @@ def adjust_local_temp_var_storage(kernel): # }}} -# {{{ automatic dependencies +# {{{ automatic dependencies, find idempotent instructions -def find_writers(kernel): +def find_accessors(kernel, readers): """ :return: a dict that maps variable names to ids of insns that write to that variable. """ - writer_insn_ids = {} + result = {} - admissible_write_vars = ( + admissible_vars = ( set(arg.name for arg in kernel.args) | set(kernel.temporary_variables.iterkeys())) for insn in kernel.instructions: - var_name = insn.get_assignee_var_name() - - if var_name not in admissible_write_vars: - raise RuntimeError("writing to '%s' is not allowed" % var_name) + if readers: + from loopy.symbolic import DependencyMapper + var_names = DependencyMapper()(insn.expression) & admissible_vars + else: + var_name = insn.get_assignee_var_name() - writer_insn_ids.setdefault(var_name, set()).add(insn.id) + if var_name not in admissible_vars: + raise RuntimeError("writing to '%s' is not allowed" % var_name) + var_names = [var_name] - return writer_insn_ids + for var_name in var_names: + result.setdefault(var_name, set()).add(insn.id) + return result -def add_automatic_dependencies(kernel): - writer_map = find_writers(kernel) +def add_idempotence_and_automatic_dependencies(kernel): + writer_map = find_accessors(kernel, readers=False) arg_names = set(arg.name for arg in kernel.args) var_names = arg_names | set(kernel.temporary_variables.iterkeys()) from loopy.symbolic import DependencyMapper - dep_map = DependencyMapper(composite_leaves=False) - new_insns = [] + dm = DependencyMapper(composite_leaves=False) + dep_map = {} + for insn in kernel.instructions: - read_vars = ( - set(var.name for var in dep_map(insn.expression)) + dep_map[insn.id] = ( + set(var.name for var in dm(insn.expression)) & var_names) + new_insns = [] + for insn in kernel.instructions: auto_deps = [] - for var in read_vars: + + # {{{ add automatic dependencies + all_my_var_writers = set() + for var in dep_map[insn.id]: var_writers = writer_map.get(var, set()) + all_my_var_writers |= var_writers - if not var_writers and var not in var_names: + if not var_writers and var not in arg_names: from warnings import warn warn("'%s' is read, but never written." % var) @@ -365,9 +375,26 @@ def add_automatic_dependencies(kernel): if len(var_writers) == 1: auto_deps.extend(var_writers) + # }}} + + # {{{ find dependency loops, flag idempotence + + while True: + last_all_my_var_writers = all_my_var_writers + + for writer_insn_id in last_all_my_var_writers: + for var in dep_map[writer_insn_id]: + all_my_var_writers = all_my_var_writers | writer_map.get(var, set()) + + if last_all_my_var_writers == all_my_var_writers: + break + + # }}} + new_insns.append( insn.copy( - insn_deps=insn.insn_deps + auto_deps)) + insn_deps=insn.insn_deps + auto_deps, + idempotent=insn.id not in all_my_var_writers)) return kernel.copy(instructions=new_insns) @@ -514,7 +541,7 @@ def assign_automatic_axes(kernel, only_axis_0=True): from loopy import split_dimension return assign_automatic_axes( split_dimension(kernel, iname, inner_length=local_size[axis], - outer_tag=UnrollTag(), inner_tag=new_tag, no_slabs=True), + outer_tag=UnrollTag(), inner_tag=new_tag), only_axis_0=only_axis_0) new_iname_to_tag = kernel.iname_to_tag.copy() @@ -613,7 +640,7 @@ def generate_loop_schedules_internal(kernel, schedule=[]): for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] - if insn.idempotent: + if insn.idempotent == True: # If insn is idempotent, it may be placed inside a more deeply # nested loop without harm. @@ -621,7 +648,8 @@ def generate_loop_schedules_internal(kernel, schedule=[]): insn.all_inames() - parallel_inames <= active_inames - parallel_inames) - else: + + elif insn.idempotent == False: # If insn is not idempotent, we must insist that it is placed inside # the exactly correct set of loops. @@ -630,6 +658,10 @@ def generate_loop_schedules_internal(kernel, schedule=[]): == active_inames - parallel_inames) + else: + raise RuntimeError("instruction '%s' has undetermined idempotence" + % insn.id) + if (iname_deps_satisfied and set(insn.insn_deps) <= scheduled_insn_ids): scheduled_insn_ids.add(insn.id) @@ -782,7 +814,7 @@ def insert_barriers(kernel, schedule, level=0): # {{{ issue dependency-based barriers for this instruction - if insn.id in owed_barriers: + if set(insn.insn_deps) & owed_barriers: issue_barrier(is_pre_barrier=False) # }}} @@ -827,7 +859,7 @@ def generate_loop_schedules(kernel): # }}} kernel = assign_automatic_axes(kernel) - kernel = add_automatic_dependencies(kernel) + kernel = add_idempotence_and_automatic_dependencies(kernel) kernel = adjust_local_temp_var_storage(kernel) check_for_double_use_of_hw_axes(kernel) diff --git a/test/test_matmul.py b/test/test_matmul.py index c6797c6bc3b654ce13e8bfa2067d586240901871..7cb8ac4e709ce425fd0c3e50e6ec31d3a60a728a 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -1,3 +1,5 @@ +from __future__ import division + import numpy as np import numpy.linalg as la import pyopencl as cl @@ -214,16 +216,16 @@ def test_plain_matrix_mul_new_ui(ctx_factory): name="matmul", assumptions="n >= 16") knl = lp.split_dimension(knl, "i", 16, - outer_tag="g.0", inner_tag="l.1", no_slabs=True) + outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 8, - outer_tag="g.1", inner_tag="l.0", no_slabs=True) - knl = lp.split_dimension(knl, "k", 32, no_slabs=True) + outer_tag="g.1", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 32) knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -251,8 +253,7 @@ def test_rank_one(ctx_factory): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = int(get_suitable_size(ctx)**(3/2)) - print n + n = int(get_suitable_size(ctx)**(2.7/2)) knl = lp.LoopKernel(ctx.devices[0], "[n] -> {[i,j]: 0<=i,j<n}", @@ -267,33 +268,71 @@ def test_rank_one(ctx_factory): ], name="rank_one", assumptions="n >= 16") - #knl = lp.split_dimension(knl, "i", 16, - #outer_tag="g.0", inner_tag="l.1", no_slabs=True) - #knl = lp.split_dimension(knl, "j", 8, - #outer_tag="g.1", inner_tag="l.0", no_slabs=True) - #knl = lp.split_dimension(knl, "k", 32, no_slabs=True) - - knl = lp.realize_cse(knl, "a", dtype)#, ["i_inner"]) - knl = lp.realize_cse(knl, "b", dtype)#, ["j_inner"]) - - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=6) - - a = cl_random.rand(queue, n, dtype=dtype) - b = cl_random.rand(queue, n, dtype=dtype) - refsol = a.get()[:, np.newaxis] * b.get() - c = cl_array.empty(queue, refsol.shape, refsol.dtype) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, n**2) + def variant_1(knl): + knl = lp.realize_cse(knl, "a", dtype) + knl = lp.realize_cse(knl, "b", dtype) + return knl + + def variant_2(knl): + knl = lp.split_dimension(knl, "i", 16, + outer_tag="g.0", inner_tag="l.0") + knl = lp.split_dimension(knl, "j", 16, + outer_tag="g.1", inner_tag="l.1") + + knl = lp.realize_cse(knl, "a", dtype) + knl = lp.realize_cse(knl, "b", dtype) + return knl + + def variant_3(knl): + knl = lp.split_dimension(knl, "i", 16, + outer_tag="g.0", inner_tag="l.0") + knl = lp.split_dimension(knl, "j", 16, + outer_tag="g.1", inner_tag="l.1") + + knl = lp.realize_cse(knl, "a", dtype, ["i_inner"]) + knl = lp.realize_cse(knl, "b", dtype, ["j_inner"]) + return knl + + def variant_4(knl): + knl = lp.split_dimension(knl, "i", 256, + outer_tag="g.0", slabs=(0, -1)) + knl = lp.split_dimension(knl, "j", 256, + outer_tag="g.1", slabs=(0, -1)) + + knl = lp.realize_cse(knl, "a", dtype, ["i_inner"]) + knl = lp.realize_cse(knl, "b", dtype, ["j_inner"]) + + knl = lp.split_dimension(knl, "i_inner", 16, + inner_tag="l.0") + knl = lp.split_dimension(knl, "j_inner", 16, + inner_tag="l.1") + + knl = lp.split_dimension(knl, "j_inner_0", 16, + outer_tag="l.1", inner_tag="l.0") + knl = lp.split_dimension(knl, "i_inner_0", 16, + outer_tag="l.1", inner_tag="l.0") + return knl + + #for variant in [variant_1, variant_2, variant_3]: + for variant in [variant_4]: + kernel_gen = lp.generate_loop_schedules(variant(knl)) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + + a = cl_random.rand(queue, n, dtype=dtype) + b = cl_random.rand(queue, n, dtype=dtype) + refsol = a.get()[:, np.newaxis] * b.get() + c = cl_array.empty(queue, refsol.shape, refsol.dtype) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, n**2)