diff --git a/loopy/__init__.py b/loopy/__init__.py index 8633d71defa850293b2e6c8a1dfd80b812275800..e1745099d236a0a4fd94b04d33b973dda3acc731 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -388,8 +388,7 @@ def tag_dimensions(kernel, iname_to_tag, force=False): iname_to_tag = dict((iname, parse_tag(tag)) for iname, tag in iname_to_tag.iteritems()) - from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase, - LocalIndexTag) + from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase) new_iname_to_tag = kernel.iname_to_tag.copy() for iname, new_tag in iname_to_tag.iteritems(): @@ -397,9 +396,7 @@ def tag_dimensions(kernel, iname_to_tag, force=False): retag_ok = False - if (isinstance(old_tag, AutoLocalIndexTagBase) - and (new_tag is None - or isinstance(new_tag, LocalIndexTag))): + if isinstance(old_tag, AutoLocalIndexTagBase): retag_ok = True if not retag_ok and old_tag is not None and new_tag is None: diff --git a/loopy/cse.py b/loopy/cse.py index b16983f5e3bbf8eeddcc2d7c84542065131b5c54..db1da2d19ae46f19b1b2299573d3bae490a2e44a 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -476,8 +476,8 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], if new_storage_axis_names is not None and i < len(new_storage_axis_names): name = new_storage_axis_names[i] tag_lookup_saxis = name - if new_name in (kernel.all_variable_names() | newly_created_var_names): - raise RuntimeError("new storage axis name '%s' already exists" % new_name) + if name in (kernel.all_variable_names() | newly_created_var_names): + raise RuntimeError("new storage axis name '%s' already exists" % name) if name in (kernel.all_variable_names() | newly_created_var_names): @@ -488,7 +488,7 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], storage_axis_name_to_tag[name] = storage_axis_to_tag.get( tag_lookup_saxis, default_tag) - newly_created_var_names.add(new_name) + newly_created_var_names.add(name) expr_subst_dict[old_name] = var(name) del storage_axis_to_tag diff --git a/test/test_nbody.py b/test/test_nbody.py index 0f9db5f724a46f3ff53a6274b4a4ed04ca68469a..fb2cee8c26e8585c3ab05e9d855a4807805a2d53 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -40,6 +40,7 @@ def test_nbody(ctx_factory): def variant_cpu(knl): knl = lp.split_dimension(knl, "i", 1024, outer_tag="g.0", slabs=(0,1)) + knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) return knl, [] def variant_gpu(knl): @@ -47,10 +48,12 @@ def test_nbody(ctx_factory): outer_tag="g.0", inner_tag="l.0", slabs=(0,1)) knl = lp.split_dimension(knl, "j", 256, slabs=(0,1)) knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None) - knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"]) + knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"], + ["x_fetch_j", "x_fetch_k"]) + knl = lp.tag_dimensions(knl, dict(x_fetch_k="unr")) return knl, ["j_outer", "j_inner"] - n = 100 + n = 3000 for variant in [variant_gpu]: variant_knl, loop_prio = variant(knl) @@ -59,7 +62,7 @@ def test_nbody(ctx_factory): kernel_gen = lp.check_kernels(kernel_gen, dict(N=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, - op_count=4*n**2*1e-9, op_label="GOps/s", + op_count=n**2*1e-6, op_label="M particle pairs", parameters={"N": n}, print_ref_code=True)