From 26a50987c56ed8e0d78613798aaf03801b891b6f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 16 Jan 2012 02:31:03 -0500
Subject: [PATCH] Test float4 functionality.

---
 test/test_linalg.py | 164 +++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 92 deletions(-)

diff --git a/test/test_linalg.py b/test/test_linalg.py
index ef0e627bd..96cbb869c 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -87,71 +87,60 @@ def get_suitable_size(ctx):
 
 
 
+def check_float4(result, ref_result):
+    return all(np.allclose(ref_result[comp], result[comp], rtol=1e-3, atol=1e-3)
+            for comp in ["x", "y", "z", "w"])
+
 def test_axpy(ctx_factory):
-    dtype = np.float32
     ctx = ctx_factory()
-    order = "C"
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = 20*1024**2
 
-    knl = lp.make_kernel(ctx.devices[0],
-            "[n] -> {[i]: 0<=i<n}",
-            [
-                "z[i] = a*x[i]+b*y[i]"
-                ],
-            [
-                lp.ScalarArg("a", dtype),
-                lp.ArrayArg("x", dtype, shape="n,"),
-                lp.ScalarArg("b", dtype),
-                lp.ArrayArg("y", dtype, shape="n,"),
-                lp.ArrayArg("z", dtype, shape="n,"),
-                lp.ScalarArg("n", np.int32, approximately=n),
-                ],
-            name="axpy", assumptions="n>=1")
-
-    def variant_seq(knl):
-        return knl
-
-    def variant_cpu(knl):
-        unroll = 16
-        block_size = unroll*4096
-        knl = lp.split_dimension(knl, "i", block_size, outer_tag="g.0", slabs=(0, 1))
-        knl = lp.split_dimension(knl, "i_inner", unroll, inner_tag="unr")
-        return knl
+    vec = cl_array.vec
 
-    def variant_gpu(knl):
-        unroll = 4
-        block_size = 256
-        knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0", slabs=(0, 1))
-        knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")
-        return knl
+    for dtype, check, a, b in [
+            (vec.float4, check_float4,
+                vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)),
+            (np.float32, None, 5, 7),
+            ]:
+        knl = lp.make_kernel(ctx.devices[0],
+                "[n] -> {[i]: 0<=i<n}",
+                [
+                    "z[i] = a*x[i]+b*y[i]"
+                    ],
+                [
+                    lp.ScalarArg("a", dtype),
+                    lp.ArrayArg("x", dtype, shape="n,"),
+                    lp.ScalarArg("b", dtype),
+                    lp.ArrayArg("y", dtype, shape="n,"),
+                    lp.ArrayArg("z", dtype, shape="n,"),
+                    lp.ScalarArg("n", np.int32, approximately=n),
+                    ],
+                name="axpy", assumptions="n>=1")
 
-    #x = cl_array.to_device(queue, np.random.rand(n).astype(dtype))
-    #y = cl_array.to_device(queue, np.random.rand(n).astype(dtype))
-    x = cl_random.rand(queue, n, dtype=dtype, luxury=2)
-    y = cl_random.rand(queue, n, dtype=dtype, luxury=2)
-    #print np.isnan(x.get()).any()
-    #1/0
-    z = cl_array.zeros_like(x)
-    refsol = (2*x+3*y).get()
-
-    for variant in [variant_seq, variant_cpu, variant_gpu]:
-        kernel_gen = lp.generate_loop_schedules(variant(knl),
-                loop_priority=["i_inner_outer"])
-        kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
+        seq_knl = knl
 
-        def launcher(kernel, gsize, lsize, check):
-            evt = kernel(queue, gsize(n), lsize(n), 2, x.data, 3, y.data, z.data, n,
-                    g_times_l=True)
+        def variant_cpu(knl):
+            unroll = 16
+            block_size = unroll*4096
+            knl = lp.split_dimension(knl, "i", block_size, outer_tag="g.0", slabs=(0, 1))
+            knl = lp.split_dimension(knl, "i_inner", unroll, inner_tag="unr")
+            return knl
 
-            if check:
-                check_error(refsol, z.get())
+        def variant_gpu(knl):
+            unroll = 4
+            block_size = 256
+            knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0", slabs=(0, 1))
+            knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")
+            return knl
 
-            return evt
+        for variant in [variant_cpu, variant_gpu]:
+            kernel_gen = lp.generate_loop_schedules(variant(knl))
+            kernel_gen = lp.check_kernels(kernel_gen, dict(n=n))
 
-        lp.drive_timing_run(kernel_gen, queue, launcher, 5*n)
+            lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
+                    op_count=np.dtype(dtype).itemsize*n*3/1e9, op_label="GBytes/s",
+                    parameters={"a": a, "b": b, "n": n}, check_result=check)
 
 
 
@@ -194,52 +183,43 @@ def test_transpose(ctx_factory):
 
 
 def test_plain_matrix_mul(ctx_factory):
-    dtype = np.float32
     ctx = ctx_factory()
     order = "C"
-    queue = cl.CommandQueue(ctx,
-            properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     n = get_suitable_size(ctx)
 
-    knl = lp.make_kernel(ctx.devices[0],
-            "{[i,j,k]: 0<=i,j,k<%d}" % n,
-            [
-                "c[i, j] = sum_float32(k, a[i, k]*b[k, j])"
-                ],
-            [
-                lp.ArrayArg("a", dtype, shape=(n, n), order=order),
-                lp.ArrayArg("b", dtype, shape=(n, n), order=order),
-                lp.ArrayArg("c", dtype, shape=(n, n), order=order),
-                ],
-            name="matmul")
-
-    knl = lp.split_dimension(knl, "i", 16,
-            outer_tag="g.0", inner_tag="l.1")
-    knl = lp.split_dimension(knl, "j", 16,
-            outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 16)
-    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
-    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])
-
-    kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, {})
-
-    a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
-    c = cl_array.empty_like(a)
-    refsol = np.dot(a.get(), b.get())
-
-    def launcher(kernel, gsize, lsize, check):
-        evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data,
-                g_times_l=True)
+    for dtype, check, vec_size, reduction_func in [
+            (cl_array.vec.float4, check_float4, 4, "sum_vec_float4"),
+            (np.float32, None, 1, "sum_float32"),
+            ]:
+        knl = lp.make_kernel(ctx.devices[0],
+                "{[i,j,k]: 0<=i,j,k<%d}" % n,
+                [
+                    "c[i, j] = %s(k, a[i, k]*b[k, j])" % reduction_func
+                    ],
+                [
+                    lp.ArrayArg("a", dtype, shape=(n, n), order=order),
+                    lp.ArrayArg("b", dtype, shape=(n, n), order=order),
+                    lp.ArrayArg("c", dtype, shape=(n, n), order=order),
+                    ],
+                name="matmul")
+
+        ref_knl = knl
 
-        if check:
-            check_error(refsol, c.get())
+        knl = lp.split_dimension(knl, "i", 16,
+                outer_tag="g.0", inner_tag="l.1")
+        knl = lp.split_dimension(knl, "j", 16,
+                outer_tag="g.1", inner_tag="l.0")
+        knl = lp.split_dimension(knl, "k", 16)
+        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])
 
-        return evt
+        kernel_gen = lp.generate_loop_schedules(knl)
+        kernel_gen = lp.check_kernels(kernel_gen, {})
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+        lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen,
+                op_count=vec_size*2*n**3/1e9, op_label="GFlops/s",
+                parameters={"n": n}, check_result=check)
 
 
 
-- 
GitLab
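
Reviewer note, not part of the patch: a minimal standalone sketch of the
mechanism check_float4 relies on. pyopencl's vector types are numpy
structured dtypes, so float4 data exposes "x"/"y"/"z"/"w" component fields
that np.allclose can compare as plain float32 arrays. This assumes a working
pyopencl installation; the array size and fill values below are made up for
illustration.

    import numpy as np
    import pyopencl.array as cl_array

    vec = cl_array.vec

    a = vec.make_float4(1, 2, 3, 4)      # numpy structured scalar
    arr = np.empty(8, dtype=vec.float4)  # host-side array of float4 elements
    arr[:] = a                           # broadcast the scalar into the array

    # Field access yields plain float32 data, which is what lets
    # check_float4 run np.allclose per component:
    for comp in ["x", "y", "z", "w"]:
        assert np.allclose(arr[comp], a[comp], rtol=1e-3, atol=1e-3)

In the patch itself, auto_test_vs_ref receives check_float4 through its
check_result argument, so this component-wise comparison replaces the
default scalar check for the float4 variants.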