diff --git a/MEMO b/MEMO index 751f9769df87ed194078f4dc4e6c206c7b2cd198..10448216275060abadc24c1e2ff3a5a7c21d7ae8 100644 --- a/MEMO +++ b/MEMO @@ -43,10 +43,21 @@ To-do ^^^^^ - variable shuffle detection + -> will need unification + +- Fix all tests + +- Automatically generate testing code vs. sequential. - Deal with equality constraints. (These arise, e.g., when partitioning a loop of length 16 into 16s.) +- duplicate_dimensions can be implemented without having to muck around + with individual constraints: + - add_dims + - move_dims + - intersect + Future ideas ^^^^^^^^^^^^ @@ -84,6 +95,8 @@ Future ideas Dealt with ^^^^^^^^^^ +- Dimension joining + - user interface for dim length prescription - Restrict-to-sequential and tagging have nothing to do with each other. diff --git a/loopy/__init__.py b/loopy/__init__.py index aab4f1083774369b9347b227545c217e56eac33e..ae0a6d4ae4a14fde1ba9520d9febba4e28a3fcff 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -161,8 +161,8 @@ def make_kernel(*args, **kwargs): insn = insn.copy( assignee=subst_map(insn.assignee), expression=new_expression, - forced_iname_deps=[ - old_to_new.get(iname, iname) for iname in insn.forced_iname_deps], + forced_iname_deps=set( + old_to_new.get(iname, iname) for iname in insn.forced_iname_deps), ) # }}} @@ -224,8 +224,7 @@ def make_kernel(*args, **kwargs): # {{{ user-facing kernel manipulation functionality - -def split_dimension(kernel, iname, inner_length, padded_length=None, +def split_dimension(kernel, iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, slabs=(0, 0)): @@ -233,9 +232,6 @@ def split_dimension(kernel, iname, inner_length, padded_length=None, if iname not in kernel.all_inames(): raise ValueError("cannot split loop for unknown variable '%s'" % iname) - if padded_length is not None: - inner_tag = inner_tag.copy(forced_length=padded_length) - if outer_iname is None: outer_iname = iname+"_outer" if inner_iname is None: @@ -286,9 +282,9 @@ def split_dimension(kernel, iname, inner_length, padded_length=None, new_expr = subst_mapper(rls(insn.expression)) if iname in insn.forced_iname_deps: - new_forced_iname_deps = insn.forced_iname_deps[:] + new_forced_iname_deps = insn.forced_iname_deps.copy() new_forced_iname_deps.remove(iname) - new_forced_iname_deps.extend([outer_iname, inner_iname]) + new_forced_iname_deps.update([outer_iname, inner_iname]) else: new_forced_iname_deps = insn.forced_iname_deps @@ -307,7 +303,6 @@ def split_dimension(kernel, iname, inner_length, padded_length=None, result = (kernel .copy(domain=new_domain, iname_slab_increments=iname_slab_increments, - iname_to_dim=None, instructions=new_insns)) return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag}) @@ -390,6 +385,10 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non if new_inames is None: new_inames = [None] * len(duplicate_inames) + if len(new_inames) != len(duplicate_inames): + raise ValueError("If given, the new_inames argument must have the " + "same length as duplicate_inames") + temp_new_inames = [] for old_iname, new_iname in zip(duplicate_inames, new_inames): if new_iname is None: @@ -431,7 +430,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non # {{{ decide what to do with each iname - forced_iname_deps = [] + forced_iname_deps = set() from loopy.symbolic import IndexVariableFinder dependencies = IndexVariableFinder( @@ -507,7 +506,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non force_dependency = kind == "g" if force_dependency: - forced_iname_deps.append(iname) + forced_iname_deps.add(iname) # }}} @@ -604,7 +603,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non # {{{ convenience -def add_prefetch(kernel, var_name, fetch_dims=[]): +def add_prefetch(kernel, var_name, fetch_dims=[], new_inames=None): used_cse_tags = set() def map_cse(expr, rec): used_cse_tags.add(expr.tag) @@ -632,7 +631,8 @@ def add_prefetch(kernel, var_name, fetch_dims=[]): dtype = kernel.temporary_variables[var_name].dtype for cse_tag in new_cse_tags: - kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims) + kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims, + new_inames=new_inames) return kernel diff --git a/loopy/check.py b/loopy/check.py index 3717084dd9610024676d36d89c20648bf8fb1801..cb20950bce4d7ef0c099100f3b0f018ca3b46d41 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -210,6 +210,15 @@ def check_implemented_domains(kernel, implemented_domains): # }}} +def run_automatic_checks(kernel): + import loopy.check as chk + + chk.check_for_double_use_of_hw_axes(kernel) + chk.check_for_unused_hw_axes(kernel) + chk.check_for_inactive_iname_access(kernel) + chk.check_for_write_races(kernel) + + # {{{ user-invoked checks def get_problems(kernel, parameters): diff --git a/loopy/kernel.py b/loopy/kernel.py index 4c31a30c05d73407553575182850131912c535c8..25d1ca77dbc703ca9b145bc0bceac17466f19d4c 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -214,7 +214,7 @@ class Instruction(Record): a :class:`LoopKernel`. :ivar assignee: :ivar expression: - :ivar forced_iname_deps: a list of inames that are added to the list of iname + :ivar forced_iname_deps: a set of inames that are added to the list of iname dependencies :ivar insn_deps: a list of ids of :class:`Instruction` instances that *must* be executed before this one. Note that loop scheduling augments this @@ -233,9 +233,12 @@ class Instruction(Record): """ def __init__(self, id, assignee, expression, - forced_iname_deps=[], insn_deps=[], boostable=None, + forced_iname_deps=set(), insn_deps=set(), boostable=None, temp_var_type=None, duplicate_inames_and_tags=[]): + assert isinstance(forced_iname_deps, set) + assert isinstance(insn_deps, set) + Record.__init__(self, id=id, assignee=assignee, expression=expression, forced_iname_deps=forced_iname_deps, @@ -452,7 +455,7 @@ class LoopKernel(Record): preamble=None, assumptions=None, iname_slab_increments={}, temporary_variables={}, - local_sizes=None, + local_sizes={}, iname_to_tag={}, iname_to_tag_requests=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. @@ -523,17 +526,17 @@ class LoopKernel(Record): else: label = "insn" if groups["insn_deps"] is not None: - insn_deps = [dep.strip() for dep in groups["insn_deps"].split(",")] + insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(",")) else: - insn_deps = [] + insn_deps = set() if groups["iname_deps_and_tags"] is not None: inames_and_tags = parse_iname_and_tag_list( groups["iname_deps_and_tags"]) - forced_iname_deps = [iname for iname, tag in inames_and_tags] + forced_iname_deps = set(iname for iname, tag in inames_and_tags) iname_to_tag_requests.update(dict(inames_and_tags)) else: - forced_iname_deps = [] + forced_iname_deps = set() if groups["duplicate_inames_and_tags"] is not None: duplicate_inames_and_tags = parse_iname_and_tag_list( @@ -566,7 +569,6 @@ class LoopKernel(Record): if len(set(insn.id for insn in insns)) != len(insns): raise RuntimeError("instruction ids do not appear to be unique") - if assumptions is None: assumptions_space = domain.get_space().params() assumptions = isl.Set.universe(assumptions_space) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a06e28b4eff24e6451281e3b70c3a86f7697e59a..d7d02effe36ccb29720bb1388922105f748b39a4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -57,7 +57,7 @@ def realize_reduction(kernel): id=kernel.make_unique_instruction_id( extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, - forced_iname_deps=list(insn.all_inames() - set(expr.inames)), + forced_iname_deps=insn.all_inames() - set(expr.inames), expression=expr.operation.neutral_element) new_insns.append(init_insn) @@ -67,12 +67,12 @@ def realize_reduction(kernel): extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, expression=expr.operation(target_var, sub_expr), - insn_deps=[init_insn.id], - forced_iname_deps=list(insn.all_inames() | set(expr.inames))) + insn_deps=set([init_insn.id]) | insn.insn_deps, + forced_iname_deps=insn.all_inames() | set(expr.inames)) new_insns.append(reduction_insn) - new_insn_insn_deps.append(reduction_insn.id) + new_insn_insn_deps.add(reduction_insn.id) return target_var @@ -80,15 +80,15 @@ def realize_reduction(kernel): cb_mapper = ReductionCallbackMapper(map_reduction) for insn in kernel.instructions: - new_insn_insn_deps = [] + new_insn_insn_deps = set() new_expression = cb_mapper(insn.expression) new_insn = insn.copy( expression=new_expression, insn_deps=insn.insn_deps - + new_insn_insn_deps, - forced_iname_deps=list(insn.all_inames())) + | new_insn_insn_deps, + forced_iname_deps=insn.all_inames()) new_insns.append(new_insn) @@ -149,7 +149,7 @@ def add_boostability_and_automatic_dependencies(kernel): new_insns = [] for insn in kernel.instructions: - auto_deps = [] + auto_deps = set() # {{{ add automatic dependencies @@ -170,7 +170,7 @@ def add_boostability_and_automatic_dependencies(kernel): % (var, insn.id)) if len(var_writers) == 1: - auto_deps.extend(var_writers) + auto_deps.update(var_writers) # }}} @@ -195,7 +195,7 @@ def add_boostability_and_automatic_dependencies(kernel): new_insns.append( insn.copy( - insn_deps=insn.insn_deps + auto_deps, + insn_deps=insn.insn_deps | auto_deps, boostable=boostable)) # {{{ remove boostability from isns that access non-boostable vars @@ -504,13 +504,6 @@ def preprocess_kernel(kernel): kernel = add_boostability_and_automatic_dependencies(kernel) kernel = adjust_local_temp_var_storage(kernel) - import loopy.check as chk - - chk.check_for_double_use_of_hw_axes(kernel) - chk.check_for_unused_hw_axes(kernel) - chk.check_for_inactive_iname_access(kernel) - chk.check_for_write_races(kernel) - return kernel diff --git a/loopy/schedule.py b/loopy/schedule.py index 9b597e844b8731bf2c8544f0488a7aed8dc67bd0..bae8bb612c79977040cf60692bc7794b8ed574ae 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -431,6 +431,9 @@ def generate_loop_schedules(kernel, loop_priority=[]): from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) + from loopy.check import run_automatic_checks + run_automatic_checks(kernel) + schedule_count = 0 for gen_sched in generate_loop_schedules_internal(kernel, loop_priority): diff --git a/test/test_linalg.py b/test/test_linalg.py index 265353abee68cd54a6e2882d6ee67d9bb3e363ff..23370ced982b89f4e02d7b58d9db025c64bbf661 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -586,7 +586,7 @@ def test_image_matrix_mul(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = a[i, k]*b[k, j]" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, 2), @@ -633,12 +633,12 @@ def test_image_matrix_mul_ilp(ctx_factory): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = get_suitable_size(ctx) + n = 2*get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = a[i, k]*b[k, j]" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, 2), @@ -655,8 +655,12 @@ def test_image_matrix_mul_ilp(ctx_factory): knl = lp.split_dimension(knl, "k", 2) # conflict-free knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) - #knl = lp.add_prefetch(knl, 'b', [("j_inner_outer", "j_inner_inner"), "k_inner"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"], + ["b_j_io", "b_j_ii", "b_k_i"]) + knl = lp.join_dimensions(knl, ["b_j_io", "b_j_ii"]) + + #print lp.preprocess_kernel(knl) + #1/0 kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))