diff --git a/MEMO b/MEMO index 3914782d73167efc53ccbae5a8de12352d7de25e..225dfffd985865d135617e890cdbdde8d4cfc102 100644 --- a/MEMO +++ b/MEMO @@ -42,13 +42,10 @@ Things to consider To-do ^^^^^ -- variable shuffle detection - -> will need unification - - Automatically generate testing code vs. sequential. - For forced workgroup sizes: check that at least one iname - maps to it. + maps to them. - If isl can prove that all operands are positive, may use '/' instead of 'floor_div'. @@ -95,6 +92,9 @@ Future ideas Dealt with ^^^^^^^^^^ +- variable shuffle detection + -> will need unification + - Dimension joining - user interface for dim length prescription diff --git a/loopy/__init__.py b/loopy/__init__.py index 4eb0e2c55bfd7a36c42edfda26438d407f3527c3..3b80b4689d5e80c340e3a25b74722331ccad7de2 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -235,9 +235,9 @@ def make_kernel(*args, **kwargs): def split_dimension(kernel, iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, - slabs=(0, 0)): + slabs=(0, 0), do_tagged_check=True): - if kernel.iname_to_tag.get(iname) is not None: + if do_tagged_check and kernel.iname_to_tag.get(iname) is not None: raise RuntimeError("cannot split already tagged iname '%s'" % iname) if iname not in kernel.all_inames(): diff --git a/loopy/cse.py b/loopy/cse.py index d169d0d5bdb9534b9d6cb542778811f5934b9232..6a1ab747e8556eeab38e10cc1e6e45ca8b5c4647 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -295,7 +295,7 @@ def process_cses(kernel, lead_csed, cse_descriptors): def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local, - new_inames, ind_iname_to_tag): + independent_inames, new_inames, ind_iname_to_tag): insn = lead_csed.insn # {{{ decide whether to force a dep @@ -310,13 +310,13 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local, assert dependencies <= parent_inames for iname in parent_inames: - if iname in lead_csed.independent_inames: + if iname in independent_inames: tag = ind_iname_to_tag[iname] else: tag = kernel.iname_to_tag.get(iname) if should_cse_force_iname_dep( - iname, lead_csed.independent_inames, tag, dependencies, + iname, independent_inames, tag, dependencies, target_var_is_local, lead_csed.cse): forced_iname_deps.add(iname) @@ -324,7 +324,7 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local, assignee = var(target_var_name) - if lead_csed.independent_inames: + if new_inames: assignee = assignee[tuple( var(iname) for iname in new_inames )] @@ -334,8 +334,7 @@ def make_compute_insn(kernel, lead_csed, target_var_name, target_var_is_local, subst_map = SubstitutionMapper(make_subst_func( dict( (old_iname, var(new_iname)) - for old_iname, new_iname in zip(lead_csed.independent_inames, - new_inames)))) + for old_iname, new_iname in zip(independent_inames, new_inames)))) new_inner_expr = subst_map(lead_csed.cse.child) insn_prefix = lead_csed.cse.prefix @@ -483,7 +482,7 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], compute_insn = make_compute_insn( kernel, lead_csed, target_var_name, target_var_is_local, - new_inames, ind_iname_to_tag) + independent_inames, new_inames, ind_iname_to_tag) # {{{ substitute variable references into instructions @@ -493,12 +492,16 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], lead_indices = [var(iname) for iname in independent_inames] else: + found = False for csed in cse_descriptors: if cse is csed.cse: + found = True break - if cse is not csed.cse: - return rec(cse.child) + if not found: + from pymbolic.primitives import CommonSubexpression + return CommonSubexpression( + rec(cse.child), cse.prefix) lead_indices = csed.lead_index_exprs diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d7d02effe36ccb29720bb1388922105f748b39a4..2bed109f204ce3668444c3d1a7de2a2bbb1fc2b1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -359,7 +359,8 @@ def assign_automatic_axes(kernel, only_axis_0=True): from loopy import split_dimension return assign_automatic_axes( split_dimension(kernel, iname, inner_length=local_size[axis], - outer_tag=UnrollTag(), inner_tag=new_tag), + outer_tag=UnrollTag(), inner_tag=new_tag, + do_tagged_check=False), only_axis_0=only_axis_0) new_iname_to_tag = kernel.iname_to_tag.copy() diff --git a/test/test_linalg.py b/test/test_linalg.py index 23370ced982b89f4e02d7b58d9db025c64bbf661..f5e88286a9ee8feb2868423ccb41382bbc8d52d8 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -175,7 +175,6 @@ def test_transpose(ctx_factory): knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, {})