diff --git a/MEMO b/MEMO index ea2317ab2b8c5f8b4dc46dd1ed39c86d56c4c142..589e672b5427643c07c0dddca61601b9dafdf5a8 100644 --- a/MEMO +++ b/MEMO @@ -69,8 +69,6 @@ TODO a <- cse(reduce(stuff)) -- reimplement add_prefetch - - user interface for dim length prescription - How to determine which variables need to be duplicated for ILP? @@ -90,6 +88,8 @@ TODO Dealt with ^^^^^^^^^^ +- reimplement add_prefetch + - Flag, exploit idempotence - Some things involving CSEs might be impossible to schedule diff --git a/loopy/__init__.py b/loopy/__init__.py index 86c0986c026616bdef457e7a351876aa319d5507..5f62ea7d30a90595848ce00b831d5a224679ae09 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -266,9 +266,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non # the iname is *not* a dependency of the fetch expression if iname in duplicate_inames: - raise RuntimeError("duplicating an iname " - "that the CSE does not depend on " - "does not make sense") + raise RuntimeError("duplicating an iname ('%s')" + "that the CSE ('%s') does not depend on " + "does not make sense" % (iname, expr.child)) # Which iname dependencies are carried over from CSE host # to the CSE compute instruction? @@ -495,72 +495,39 @@ def check_kernels(kernel_gen, parameters, kill_level_min=3, # }}} -# {{{ high-level modifiers +# {{{ convenience -def get_input_access_descriptors(kernel): - """Return a dictionary mapping input vectors to - a list of input access descriptor. An input access - descriptor is a tuple (input_vec, index_expr). - """ - 1/0 # broken - - from loopy.symbolic import VariableIndexExpressionCollector - - from pytools import flatten - result = {} - for ivec in kernel.input_vectors(): - result[ivec] = set( - (ivec, iexpr) - for iexpr in flatten( - VariableIndexExpressionCollector(ivec)(expression) - for lvalue, expression in kernel.instructions - )) - - return result - -def add_prefetch(kernel, input_access_descr, fetch_dims, loc_fetch_axes={}): - """ - :arg input_access_descr: see :func:`get_input_access_descriptors`. - May also be the name of the variable if there is only one - reference to that variable. - :arg fetch_dims: loop dimensions indexing the input variable on which - the prefetch is to be carried out. - """ - 1/0 # broken - - if isinstance(input_access_descr, str): - var_name = input_access_descr - var_iads = get_input_access_descriptors(kernel)[var_name] - - if len(var_iads) != 1: - raise ValueError("input access descriptor for variable %s is " - "not unique" % var_name) - - input_access_descr, = var_iads - - def parse_fetch_dim(iname): - if isinstance(iname, str): - iname = (iname,) - - return tuple(kernel.tag_or_iname_to_iname(s) for s in iname) - - fetch_dims = [parse_fetch_dim(fd) for fd in fetch_dims] - ivec, iexpr = input_access_descr - - new_prefetch = getattr(kernel, "prefetch", {}).copy() - if input_access_descr in new_prefetch: - raise ValueError("a prefetch descriptor for the input access %s[%s] " - "already exists" % (ivec, iexpr)) - - from loopy.prefetch import LocalMemoryPrefetch - new_prefetch[input_access_descr] = LocalMemoryPrefetch( - kernel=kernel, - input_vector=ivec, - index_expr=iexpr, - fetch_dims=fetch_dims, - loc_fetch_axes=loc_fetch_axes) - - return kernel.copy(prefetch=new_prefetch) +def add_prefetch(kernel, var_name, fetch_dims=[]): + used_cse_tags = set() + def map_cse(expr, rec): + used_cse_tags.add(expr.tag) + rec(expr.child) + + new_cse_tags = set() + + def get_unique_cse_tag(): + from loopy.tools import generate_unique_possibilities + for cse_tag in generate_unique_possibilities(prefix="fetch_"+var_name): + if cse_tag not in used_cse_tags: + used_cse_tags.add(cse_tag) + new_cse_tags.add(cse_tag) + return cse_tag + + from loopy.symbolic import VariableFetchCSEMapper + vf_cse_mapper = VariableFetchCSEMapper(var_name, get_unique_cse_tag) + kernel = kernel.copy(instructions=[ + insn.copy(expression=vf_cse_mapper(insn.expression)) + for insn in kernel.instructions]) + + if var_name in kernel.arg_dict: + dtype = kernel.arg_dict[var_name].dtype + else: + dtype = kernel.temporary_variables[var_name].dtype + + for cse_tag in new_cse_tags: + kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims) + + return kernel # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 24ab2e93442ac3b03bd59ba32fcf5e4f6041c407..b0c621539aa008cd6c72d7431883716b7c993d74 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -506,6 +506,30 @@ class IndexVariableFinder(CombineMapper): # }}} +# {{{ variable-fetch CSE mapper + +class VariableFetchCSEMapper(IdentityMapper): + def __init__(self, var_name, cse_tag_getter): + self.var_name = var_name + self.cse_tag_getter = cse_tag_getter + + def map_variable(self, expr): + from pymbolic.primitives import CommonSubexpression + if expr.name == self.var_name: + return CommonSubexpression(expr, self.cse_tag_getter()) + else: + return IdentityMapper.map_variable(self, expr) + + def map_subscript(self, expr): + from pymbolic.primitives import CommonSubexpression, Variable, Subscript + if (isinstance(expr.aggregate, Variable) + and expr.aggregate.name == self.var_name): + return CommonSubexpression( + Subscript(expr.aggregate, self.rec(expr.index)), self.cse_tag_getter()) + else: + return IdentityMapper.map_subscript(self, expr) + +# }}} diff --git a/test/test_matmul.py b/test/test_matmul.py index 7cb8ac4e709ce425fd0c3e50e6ec31d3a60a728a..1eb51cc5a3b62c43df709e09bbf44bab49d8665c 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -258,7 +258,7 @@ def test_rank_one(ctx_factory): knl = lp.LoopKernel(ctx.devices[0], "[n] -> {[i,j]: 0<=i,j<n}", [ - "label: c[i, j] = cse(a[i], a)*cse(b[j], b)" + "label: c[i, j] = a[i]*b[j]" ], [ lp.ArrayArg("a", dtype, shape=(n,), order=order), @@ -269,8 +269,8 @@ def test_rank_one(ctx_factory): name="rank_one", assumptions="n >= 16") def variant_1(knl): - knl = lp.realize_cse(knl, "a", dtype) - knl = lp.realize_cse(knl, "b", dtype) + knl = lp.add_prefetch(knl, "a") + knl = lp.add_prefetch(knl, "b") return knl def variant_2(knl): @@ -279,8 +279,8 @@ def test_rank_one(ctx_factory): knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.realize_cse(knl, "a", dtype) - knl = lp.realize_cse(knl, "b", dtype) + knl = lp.add_prefetch(knl, "a") + knl = lp.add_prefetch(knl, "b") return knl def variant_3(knl): @@ -289,8 +289,8 @@ def test_rank_one(ctx_factory): knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.realize_cse(knl, "a", dtype, ["i_inner"]) - knl = lp.realize_cse(knl, "b", dtype, ["j_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner"]) + knl = lp.add_prefetch(knl, "b", ["j_inner"]) return knl def variant_4(knl): @@ -299,8 +299,8 @@ def test_rank_one(ctx_factory): knl = lp.split_dimension(knl, "j", 256, outer_tag="g.1", slabs=(0, -1)) - knl = lp.realize_cse(knl, "a", dtype, ["i_inner"]) - knl = lp.realize_cse(knl, "b", dtype, ["j_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner"]) + knl = lp.add_prefetch(knl, "b", ["j_inner"]) knl = lp.split_dimension(knl, "i_inner", 16, inner_tag="l.0")