diff --git a/loopy/kernel.py b/loopy/kernel.py index 30af589db7089c6c7f5c0c1c6d1ca33b57447663..ed83b30c36d964839b74201bfd9c8154c7617710 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -1101,10 +1101,10 @@ class LoopKernel(Record): loop_list = ",".join(sorted(self.insn_inames(insn))) if len(loop_list) > loop_list_width: lines.append("[%s]" % loop_list) - lines.append("%s%s <- %s ... # %s" % ( + lines.append("%s%s <- %s # %s" % ( (loop_list_width+2)*" ", insn.assignee, insn.expression, insn.id)) else: - lines.append("[%s]%s%s <- %s ... # %s" % ( + lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), insn.assignee, insn.expression, insn.id)) diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 3e3a423589d20cc321b8bb1784ba34efc41174b0..f540285db30531353b49a365975d9fb8204b3c2e 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -18,7 +18,7 @@ def test_laplacian_stiffness(ctx_factory): dim = 2 Nq = 40 # num. quadrature points - Nc = 1000 # num. cells + Nc = 100 # num. cells Nb = 20 # num. basis functions # K - run-time symbolic @@ -52,7 +52,7 @@ def test_laplacian_stiffness(ctx_factory): def variant_1(knl): # no ILP across elements knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) - knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.tag_dimensions(knl, {"i": "l.1", "j": "l.0"}) knl = lp.add_prefetch(knl, 'jacInv', ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl @@ -61,32 +61,33 @@ def test_laplacian_stiffness(ctx_factory): # with ILP across elements knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") - knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.tag_dimensions(knl, {"i": "l.1", "j": "l.0"}) knl = lp.add_prefetch(knl, "jacInv", ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"]) return knl def variant_3(knl): # no ILP across elements, precompute dPsiTransf + + # generates correct code--but suboptimal in a few ways. + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) - knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.precompute(knl, "dPsi", np.float32, - sweep_axes=["K_inner"]) knl = lp.add_prefetch(knl, "jacInv", - ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) - print lp.preprocess_kernel(knl) - 1/0 + ["jacInv_dim_0", "jacInv_dim_1", "q"]) + knl = lp.tag_dimensions(knl, {"i": "l.1", "j": "l.0"}) + knl = lp.precompute(knl, "dPsi", np.float32, + sweep_axes=["K_inner"], default_tag=None) return knl - #for variant in [variant_1, variant_2, variant_3]: - for variant in [variant_3]: + for variant in [variant_1, variant_2, variant_3]: + #for variant in [variant_3]: kernel_gen = lp.generate_loop_schedules(variant(knl), loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", - parameters={"Nc": Nc}, print_seq_code=True, + parameters={"Nc": Nc}, print_ref_code=True, timing_rounds=30)