From de8e84299100bb0d28f1b4dcd111ab6805332ac6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Fri, 11 Nov 2011 12:00:57 -0500 Subject: [PATCH] Improvements to FEM assembly. --- loopy/cse.py | 3 +- loopy/schedule.py | 5 +- test/test_fem_assembly.py | 103 ++++++++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 13 deletions(-) diff --git a/loopy/cse.py b/loopy/cse.py index f2086b2c2..124f462b9 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -443,7 +443,8 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], sub_map = SubstitutionCallbackMapper([subst_name], do_substs) for insn in kernel.instructions: - new_insns.append(insn.copy(expression=sub_map(insn.expression))) + new_insn = insn.copy(expression=sub_map(insn.expression)) + new_insns.append(new_insn) new_substs = dict( (s.name, s.copy(expression=sub_map(s.expression))) diff --git a/loopy/schedule.py b/loopy/schedule.py index c1a1fdea0..64204f20d 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -167,6 +167,8 @@ def dump_schedule(schedule): entries.append("</%s>" % sched_item.iname) elif isinstance(sched_item, RunInstruction): entries.append(sched_item.insn_id) + elif isinstance(sched_item, Barrier): + entries.append("|") else: assert False @@ -270,6 +272,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b and len(schedule) >= debug.debug_length): debug_mode = True + #print dump_schedule(schedule), len(schedule) if debug_mode: print kernel print "--------------------------------------------" @@ -293,8 +296,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b reachable_insn_ids = set() for insn_id in all_insn_ids - scheduled_insn_ids: - if debug_mode: - print insn_id insn = kernel.id_to_insn[insn_id] schedule_now = set(insn.insn_deps) <= scheduled_insn_ids diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 4502fd13b..77a5757a0 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -1,9 +1,7 @@ from __future__ import division import numpy as np -import numpy.linalg as la import pyopencl as cl -import pyopencl.array as cl_array import loopy as lp from pyopencl.tools import pytest_generate_tests_for_pyopencl \ @@ -17,8 +15,6 @@ def test_laplacian_stiffness(ctx_factory): ctx = ctx_factory() order = "C" - # FIXME: make dim-independent - dim = 2 Nq = 40 # num. quadrature points @@ -32,14 +28,97 @@ def test_laplacian_stiffness(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d " - "and 0<= ax_a, ax_b, ax_c < %(dim)d}" + "and 0<= ax_c < %(dim)d}" % dict(Nb=Nb, Nq=Nq, dim=dim), [ "dPsi(a, dxi) := sum_float32(ax_c," " jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])", "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * (" "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))" + ], + [ + lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order), + lp.ConstantArrayArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order), + lp.ArrayArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order), + lp.ConstantArrayArg("w", dtype, shape=(Nq, dim), order=order), + lp.ArrayArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order), + lp.ScalarArg("Nc", np.int32, approximately=1000), + ], + name="lapquad", assumptions="Nc>=1") + #knl = lp.tag_dimensions(knl, dict(ax_c="unr")) + seq_knl = knl + #print lp.preprocess_kernel(seq_knl) + #1/0 + + def variant_1(knl): + # no ILP across elements + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.add_prefetch(knl, 'jacInv', + ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) + return knl + + def variant_2(knl): + # with ILP across elements + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.add_prefetch(knl, "jacInv", + ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"]) + return knl + + def variant_3(knl): + # no ILP across elements, precompute dPsiTransf + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.precompute(knl, "dPsi", np.float32,) + #default_tag=None) + knl = lp.add_prefetch(knl, "jacInv", + ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) + return knl + + #for variant in [variant_1, variant_2]: + for variant in [variant_3]: + kernel_gen = lp.generate_loop_schedules(variant(knl), + loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) + kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) + + lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + op_count=0, op_label="GFlops", + parameters={"Nc": Nc}, print_seq_code=True, + timing_rounds=30) + + + + +def test_laplacian_stiffness_nd(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + + 1/0 # still encounters a dependency bug + + dim = 2 + + Nq = 40 # num. quadrature points + Nc = 1000 # num. cells + Nb = 20 # num. basis functions + + # K - run-time symbolic + + from pymbolic import var + Nc_sym = var("Nc") + + knl = lp.make_kernel(ctx.devices[0], + "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d " + "and 0<= ax_a, ax_b, ax_c < %(dim)d}" + % dict(Nb=Nb, Nq=Nq, dim=dim), + [ + "dPsi(a, dxi) := sum_float32(@ax_c," + " jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])", + "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * (" + "sum_float32(ax_b, dPsi(0,ax_b)*dPsi(1,ax_b))))" ], [ lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order), @@ -58,7 +137,7 @@ def test_laplacian_stiffness(ctx_factory): # no ILP across elements knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, 'jacInv', + knl = lp.add_prefetch(knl, 'jacInv', ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl @@ -67,7 +146,7 @@ def test_laplacian_stiffness(ctx_factory): knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, "jacInv", + knl = lp.add_prefetch(knl, "jacInv", ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"]) return knl @@ -77,14 +156,17 @@ def test_laplacian_stiffness(ctx_factory): knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) knl = lp.precompute(knl, "dPsi", ["a", "dxi"]) - knl = lp.add_prefetch(knl, "jacInv", + knl = lp.add_prefetch(knl, "jacInv", ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl for variant in [variant_1, variant_2]: #for variant in [variant_3]: - kernel_gen = lp.generate_loop_schedules(variant(knl), - loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) + kernel_gen = lp.generate_loop_schedules(seq_knl,#variant(knl), + loop_priority=["jacInv_dim_0", "jacInv_dim_1"], + debug=15) + for knl in kernel_gen: + pass kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, @@ -94,6 +176,7 @@ def test_laplacian_stiffness(ctx_factory): + if __name__ == "__main__": import sys if len(sys.argv) > 1: -- GitLab