From de8e84299100bb0d28f1b4dcd111ab6805332ac6 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 11 Nov 2011 12:00:57 -0500
Subject: [PATCH] Improvements to FEM assembly.

---
 loopy/cse.py              |   3 +-
 loopy/schedule.py         |   5 +-
 test/test_fem_assembly.py | 103 ++++++++++++++++++++++++++++++++++----
 3 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/loopy/cse.py b/loopy/cse.py
index f2086b2c2..124f462b9 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -443,7 +443,8 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[],
 
     sub_map = SubstitutionCallbackMapper([subst_name], do_substs)
     for insn in kernel.instructions:
-        new_insns.append(insn.copy(expression=sub_map(insn.expression)))
+        new_insn = insn.copy(expression=sub_map(insn.expression))
+        new_insns.append(new_insn)
 
     new_substs = dict(
             (s.name, s.copy(expression=sub_map(s.expression)))
diff --git a/loopy/schedule.py b/loopy/schedule.py
index c1a1fdea0..64204f20d 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -167,6 +167,8 @@ def dump_schedule(schedule):
             entries.append("</%s>" % sched_item.iname)
         elif isinstance(sched_item, RunInstruction):
             entries.append(sched_item.insn_id)
+        elif isinstance(sched_item, Barrier):
+            entries.append("|")
         else:
             assert False
 
@@ -270,6 +272,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
                 and len(schedule) >= debug.debug_length):
             debug_mode = True
 
+    #print dump_schedule(schedule), len(schedule)
     if debug_mode:
         print kernel
         print "--------------------------------------------"
@@ -293,8 +296,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
     reachable_insn_ids = set()
 
     for insn_id in all_insn_ids - scheduled_insn_ids:
-        if debug_mode:
-            print insn_id
         insn = kernel.id_to_insn[insn_id]
 
         schedule_now = set(insn.insn_deps) <= scheduled_insn_ids
diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py
index 4502fd13b..77a5757a0 100644
--- a/test/test_fem_assembly.py
+++ b/test/test_fem_assembly.py
@@ -1,9 +1,7 @@
 from __future__ import division
 
 import numpy as np
-import numpy.linalg as la
 import pyopencl as cl
-import pyopencl.array as cl_array
 import loopy as lp
 
 from pyopencl.tools import pytest_generate_tests_for_pyopencl \
@@ -17,8 +15,6 @@ def test_laplacian_stiffness(ctx_factory):
     ctx = ctx_factory()
     order = "C"
 
-    # FIXME: make dim-independent
-
     dim = 2
 
     Nq = 40 # num. quadrature points
@@ -32,14 +28,97 @@ def test_laplacian_stiffness(ctx_factory):
 
     knl = lp.make_kernel(ctx.devices[0],
             "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
-            "and 0<= ax_a, ax_b, ax_c < %(dim)d}" 
+            "and 0<= ax_c < %(dim)d}"
             % dict(Nb=Nb, Nq=Nq, dim=dim),
             [
                 "dPsi(a, dxi) := sum_float32(ax_c,"
                     "  jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])",
                 "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                     "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))"
+                ],
+            [
+            lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
+            lp.ConstantArrayArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
+            lp.ArrayArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
+            lp.ConstantArrayArg("w", dtype, shape=(Nq, dim), order=order),
+            lp.ArrayArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
+            lp.ScalarArg("Nc",  np.int32, approximately=1000),
+            ],
+            name="lapquad", assumptions="Nc>=1")
 
+    #knl = lp.tag_dimensions(knl, dict(ax_c="unr"))
+    seq_knl = knl
+    #print lp.preprocess_kernel(seq_knl)
+    #1/0
+
+    def variant_1(knl):
+        # no ILP across elements
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, 'jacInv',
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
+        return knl
+
+    def variant_2(knl):
+        # with ILP across elements
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.add_prefetch(knl, "jacInv",
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"])
+        return knl
+
+    def variant_3(knl):
+        # no ILP across elements, precompute dPsiTransf
+        knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
+        knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
+        knl = lp.precompute(knl, "dPsi", np.float32,)
+                #default_tag=None)
+        knl = lp.add_prefetch(knl, "jacInv",
+                ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
+        return knl
+
+    #for variant in [variant_1, variant_2]:
+    for variant in [variant_3]:
+        kernel_gen = lp.generate_loop_schedules(variant(knl),
+                loop_priority=["jacInv_dim_0", "jacInv_dim_1"])
+        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
+
+        lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
+                op_count=0, op_label="GFlops",
+                parameters={"Nc": Nc}, print_seq_code=True,
+                timing_rounds=30)
+
+
+
+
+def test_laplacian_stiffness_nd(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+    order = "C"
+
+    1/0 # still encounters a dependency bug
+
+    dim = 2
+
+    Nq = 40 # num. quadrature points
+    Nc = 1000 # num. cells
+    Nb = 20 # num. basis functions
+
+    # K - run-time symbolic
+
+    from pymbolic import var
+    Nc_sym = var("Nc")
+
+    knl = lp.make_kernel(ctx.devices[0],
+            "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
+            "and 0<= ax_a, ax_b, ax_c < %(dim)d}"
+            % dict(Nb=Nb, Nq=Nq, dim=dim),
+            [
+                "dPsi(a, dxi) := sum_float32(@ax_c,"
+                    "  jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])",
+                "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
+                    "sum_float32(ax_b, dPsi(0,ax_b)*dPsi(1,ax_b))))"
                 ],
             [
             lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
@@ -58,7 +137,7 @@ def test_laplacian_stiffness(ctx_factory):
         # no ILP across elements
         knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
         knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-        knl = lp.add_prefetch(knl, 'jacInv', 
+        knl = lp.add_prefetch(knl, 'jacInv',
                 ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
         return knl
 
@@ -67,7 +146,7 @@ def test_laplacian_stiffness(ctx_factory):
         knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1))
         knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp")
         knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
-        knl = lp.add_prefetch(knl, "jacInv", 
+        knl = lp.add_prefetch(knl, "jacInv",
                 ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"])
         return knl
 
@@ -77,14 +156,17 @@ def test_laplacian_stiffness(ctx_factory):
         knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"})
         knl = lp.precompute(knl, "dPsi",
                 ["a", "dxi"])
-        knl = lp.add_prefetch(knl, "jacInv", 
+        knl = lp.add_prefetch(knl, "jacInv",
                 ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"])
         return knl
 
     for variant in [variant_1, variant_2]:
     #for variant in [variant_3]:
-        kernel_gen = lp.generate_loop_schedules(variant(knl),
-                loop_priority=["jacInv_dim_0", "jacInv_dim_1"])
+        kernel_gen = lp.generate_loop_schedules(seq_knl,#variant(knl),
+                loop_priority=["jacInv_dim_0", "jacInv_dim_1"],
+                debug=15)
+        for knl in kernel_gen:
+            pass
         kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
 
         lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen,
@@ -94,6 +176,7 @@ def test_laplacian_stiffness(ctx_factory):
 
 
 
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
-- 
GitLab