Improvements to FEM assembly.

de8e8429 · Andreas Klöckner · 7f257cfb · de8e8429 · de8e8429 · de8e8429
Commit de8e8429 authored 13 years ago by Andreas Klöckner
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -443,7 +443,8 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],

    sub_map = SubstitutionCallbackMapper([subst_name], do_substs)
    for insn in kernel.instructions:
-        new_insns.append(insn.copy(expression=sub_map(insn.expression)))
+        new_insn = insn.copy(expression=sub_map(insn.expression))
+        new_insns.append(new_insn)

    new_substs = dict(
            (s.name, s.copy(expression=sub_map(s.expression)))

--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -167,6 +167,8 @@ def dump_schedule(schedule):
            entries.append("</%s>" % sched_item.iname)
        elif isinstance(sched_item, RunInstruction):
            entries.append(sched_item.insn_id)
+        elif isinstance(sched_item, Barrier):
+            entries.append("|")
        else:
            assert False

@@ -270,6 +272,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                and len(schedule) >= debug.debug_length):
            debug_mode = True

+    #print dump_schedule(schedule), len(schedule)
    if debug_mode:
        print kernel
        print "--------------------------------------------"
@@ -293,8 +296,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
    reachable_insn_ids = set()

    for insn_id in all_insn_ids - scheduled_insn_ids:
-        if debug_mode:
-            print insn_id
        insn = kernel.id_to_insn[insn_id]

        schedule_now = set(insn.insn_deps) <= scheduled_insn_ids

--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
 from __future__ import division

 import numpy as np
-import numpy.linalg as la
 import pyopencl as cl
-import pyopencl.array as cl_array
 import loopy as lp

 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -17,8 +15,6 @@ def test_laplacian_stiffness(ctx_factory):
    ctx = ctx_factory()
    order = "C"

-    # FIXME: make dim-independent
-
    dim = 2

    Nq = 40 # num. quadrature points
@@ -32,14 +28,97 @@ def test_laplacian_stiffness(ctx_factory):

    knl = lp.make_kernel(ctx.devices[0],
            "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
-            "and 0<= ax_a, ax_b, ax_c < %(dim)d}" 
+            "and 0<= ax_c < %(dim)d}"
            % dict(Nb=Nb, Nq=Nq, dim=dim),
            [
                "dPsi(a, dxi) := sum_float32(ax_c,"
                    "  jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])",
                "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                    "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))"
+                ],
+            [
+            lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
+            lp.ConstantArrayArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
+            lp.ArrayArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
+            lp.ConstantArrayArg("w", dtype, shape=(Nq, dim), order=order),
+            lp.ArrayArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
+            lp.ScalarArg("Nc",  np.int32, approximately=1000),
+            ],
+            name="lapquad", assumptions="Nc>=1")

+    #knl = lp.tag_dimensions(knl, dict(ax_c="unr"))
+    seq_knl = knl
+    #print lp.preprocess_kernel(seq_knl)
+    #1/0
+
+    def variant_1(knl):
+        # no ILP across elements
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, 'jacInv',
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
+        return knl
+
+    def variant_2(knl):
+        # with ILP across elements
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, "jacInv",
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"])
+        return knl
+
+    def variant_3(knl):
+        # no ILP across elements, precompute dPsiTransf
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.precompute(knl, "dPsi", np.float32,)
+                #default_tag=None)
+        knl = lp.add_prefetch(knl, "jacInv",
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
+        return knl
+
+    #for variant in [variant_1, variant_2]:
+    for variant in [variant_3]:
+        kernel_gen = lp.generate_loop_schedules(variant(knl),
+                loop_priority=["jacInv_dim_0", "jacInv_dim_1"])
+        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
+
+        lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
+                op_count=0, op_label="GFlops",
+                parameters={"Nc": Nc}, print_seq_code=True,
+                timing_rounds=30)
+
+
+
+
+def test_laplacian_stiffness_nd(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+
+    1/0 # still encounters a dependency bug
+
+    dim = 2
+
+    Nq = 40 # num. quadrature points
+    Nc = 1000 # num. cells
+    Nb = 20 # num. basis functions
+
+    # K - run-time symbolic
+
+    from pymbolic import var
+    Nc_sym = var("Nc")
+
+    knl = lp.make_kernel(ctx.devices[0],
+            "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
+            "and 0<= ax_a, ax_b, ax_c < %(dim)d}"
+            % dict(Nb=Nb, Nq=Nq, dim=dim),
+            [
+                "dPsi(a, dxi) := sum_float32(@ax_c,"
+                    "  jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])",
+                "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
+                    "sum_float32(ax_b, dPsi(0,ax_b)*dPsi(1,ax_b))))"
                ],
            [
            lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
@@ -58,7 +137,7 @@ def test_laplacian_stiffness(ctx_factory):
        # no ILP across elements
        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-        knl = lp.add_prefetch(knl, 'jacInv', 
+        knl = lp.add_prefetch(knl, 'jacInv',
                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
        return knl

@@ -67,7 +146,7 @@ def test_laplacian_stiffness(ctx_factory):
        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
        knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-        knl = lp.add_prefetch(knl, "jacInv", 
+        knl = lp.add_prefetch(knl, "jacInv",
                ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"])
        return knl

@@ -77,14 +156,17 @@ def test_laplacian_stiffness(ctx_factory):
        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
        knl = lp.precompute(knl, "dPsi",
                ["a", "dxi"])
-        knl = lp.add_prefetch(knl, "jacInv", 
+        knl = lp.add_prefetch(knl, "jacInv",
                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
        return knl

    for variant in [variant_1, variant_2]:
    #for variant in [variant_3]:
-        kernel_gen = lp.generate_loop_schedules(variant(knl),
-                loop_priority=["jacInv_dim_0", "jacInv_dim_1"])
+        kernel_gen = lp.generate_loop_schedules(seq_knl,#variant(knl),
+                loop_priority=["jacInv_dim_0", "jacInv_dim_1"],
+                debug=15)
+        for knl in kernel_gen:
+            pass
        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))

        lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
@@ -94,6 +176,7 @@ def test_laplacian_stiffness(ctx_factory):



+
 if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1: