from __future__ import division import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array as cl_array import pyopencl.clrandom as cl_random import loopy as lp from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): if isinstance(shape, int): shape = (shape, shape) l = max(shape) eye_ish = id_factor*np.eye(l, k=od) if inc_factor: eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) ary = np.asarray( ran_factor*np.random.randn(*shape) + eye_ish[:shape[0], :shape[1]], dtype=dtype, order=order) return cl_array.to_device(queue, ary) DO_CHECK = True DEBUG_PREAMBLE = r""" #pragma OPENCL EXTENSION cl_amd_printf: enable #define MY_J (j_outer*64+j_inner_outer*16+j_inner_inner) #define MY_I (i_outer*16+i_inner) #define IFDIAG if (MY_I == MY_J) #define TST(S) if (MY_J == 144 && MY_I == 16-48) \ for (int aa = 0; aa < 16: ++ab) \ for (int bb = 0; bb < 16: ++bb) """ def check_error(refsol, sol): if not DO_CHECK: return if sol.shape == 2: norm_order = "fro" else: norm_order = 2 rel_err = la.norm(refsol-sol, norm_order)/la.norm(refsol, norm_order) if rel_err > 1e-5 or np.isinf(rel_err) or np.isnan(rel_err): if 1: import matplotlib.pyplot as pt pt.imshow(refsol-sol, interpolation="nearest") pt.colorbar() pt.show() elif 0: print "---------------------------" print "ACTUAL" print "---------------------------" np.set_printoptions(threshold=1000000, linewidth=200) print sol[:16,:16] print "---------------------------" print "CORRECT" print "---------------------------" print refsol[:16,:16] raise RuntimeError("check failed, rel err=%g" % rel_err) def get_suitable_size(ctx): dev, = ctx.devices if dev.type == cl.device_type.CPU: return 160 else: return 1600 def check_float4(result, ref_result): for comp in ["x", "y", "z", "w"]: return np.allclose(ref_result[comp], result[comp], rtol=1e-3, atol=1e-3), None def test_axpy(ctx_factory): ctx = ctx_factory() n = 20*1024**2 vec = cl_array.vec for dtype, check, a, b in [ (np.complex64, None, 5, 7), (vec.float4, check_float4, vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)), (np.float32, None, 5, 7), ]: knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i]: 0<=i<n}", [ "z[i] = a*x[i]+b*y[i]" ], [ lp.ScalarArg("a", dtype), lp.ArrayArg("x", dtype, shape="n,"), lp.ScalarArg("b", dtype), lp.ArrayArg("y", dtype, shape="n,"), lp.ArrayArg("z", dtype, shape="n,"), lp.ScalarArg("n", np.int32, approximately=n), ], name="axpy", assumptions="n>=1", preamble=""" #include <pyopencl-complex.h> """) seq_knl = knl def variant_cpu(knl): unroll = 16 block_size = unroll*4096 knl = lp.split_dimension(knl, "i", block_size, outer_tag="g.0", slabs=(0, 1)) knl = lp.split_dimension(knl, "i_inner", unroll, inner_tag="unr") return knl def variant_gpu(knl): unroll = 4 block_size = 256 knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0", slabs=(0, 1)) knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0") return knl for variant in [variant_cpu, variant_gpu]: kernel_gen = lp.generate_loop_schedules(variant(knl)) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=np.dtype(dtype).itemsize*n*3/1e9, op_label="GBytes", parameters={"a": a, "b": b, "n": n}, check_result=check) def test_transpose(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() order = "C" n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j]: 0<=i,j<%d}" % n, [ "b[i, j] = a[j, i]" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), ], name="transpose") seq_knl = knl knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, {}) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=dtype.itemsize*n**2*2/1e9, op_label="GByte", parameters={}) def test_plain_matrix_mul(ctx_factory): ctx = ctx_factory() order = "C" n = get_suitable_size(ctx) for dtype, check, vec_size, reduction_func in [ (cl_array.vec.float4, check_float4, 4, "sum_vec_float4"), (np.float32, None, 1, "sum_float32"), ]: knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = %s(k, a[i, k]*b[k, j])" % reduction_func ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") ref_knl = knl knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, {}) lp.auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=vec_size*2*n**3/1e9, op_label="GFlops/s", parameters={"n": n}, check_result=check) def test_variable_size_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ "label: c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), lp.ScalarArg("n", np.int32, approximately=n), ], name="matmul", assumptions="n >= 16") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 8, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) def test_rank_one(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" n = int(get_suitable_size(ctx)**(2.7/2)) knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j]: 0<=i,j<n}", [ "label: c[i, j] = a[i]*b[j]" ], [ lp.ArrayArg("a", dtype, shape=(n,), order=order), lp.ArrayArg("b", dtype, shape=(n,), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), lp.ScalarArg("n", np.int32, approximately=n), ], name="rank_one", assumptions="n >= 16") def variant_1(knl): knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") return knl def variant_2(knl): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") return knl def variant_3(knl): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.add_prefetch(knl, "a", ["i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner"]) return knl def variant_4(knl): knl = lp.split_dimension(knl, "i", 256, outer_tag="g.0", slabs=(0, 1)) knl = lp.split_dimension(knl, "j", 256, outer_tag="g.1", slabs=(0, 1)) knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) knl = lp.split_dimension(knl, "i_inner", 16, inner_tag="l.0") knl = lp.split_dimension(knl, "j_inner", 16, inner_tag="l.1") knl = lp.split_dimension(knl, "a_dim_0", 16, outer_tag="l.1", inner_tag="l.0") knl = lp.split_dimension(knl, "b_dim_0", 16, outer_tag="l.1", inner_tag="l.0") return knl seq_knl = knl for variant in [variant_1, variant_2, variant_3, variant_4]: #for variant in [variant_4]: kernel_gen = lp.generate_loop_schedules(variant(knl)) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=np.dtype(dtype).itemsize*n**2/1e9, op_label="GBytes", parameters={"n": n}) def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) n = 6*16*2 knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") i_reg = 2 j_reg = 2 i_chunks = 16 j_chunks = 16 knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) def test_intel_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) n = 128+32 knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") i_reg = 4 j_reg = 4 i_chunks = 16 j_chunks = 16 knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_dimension(knl, "k", 16) #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"]) knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"]) # FIXME: Grouped prefetch #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) #knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) kernel_gen = lp.generate_loop_schedules(knl) #hints=["k_outer", "k_inner_outer", "k_inner_inner"] kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) def test_magma_fermi_matrix_mul(ctx_factory): 1/0 # not updated to new conventions dtype = np.float32 ctx = ctx_factory() order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) n = 6*16*16 knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, 2), lp.ImageArg("b", dtype, 2), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") i_reg = 4 j_reg = 4 i_chunks = 16 j_chunks = 16 knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") knl = lp.split_dimension(knl, "k", 16) #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) kernel_gen = lp.generate_loop_schedules(knl) #hints=["k_outer", "k_inner_outer", "k_inner_inner"] kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) a_img = cl.image_from_array(ctx, a.get(), 1) b_img = cl.image_from_array(ctx, b.get(), 1) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) def test_image_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, 2), lp.ImageArg("b", dtype, 2), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) # conflict-free knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) a_img = cl.image_from_array(ctx, a.get(), 1) b_img = cl.image_from_array(ctx, b.get(), 1) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) def test_image_matrix_mul_ilp(ctx_factory): dtype = np.float32 ctx = ctx_factory() order = "C" n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, shape=(n, n)), lp.ImageArg("b", dtype, shape=(n, n)), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") seq_knl = knl ilp = 4 knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1") j_inner_split = 4 knl = lp.split_dimension(knl, "j", ilp*j_inner_split, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 2) # conflict-free? knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=2*n**3/1e9, op_label="GFlops", parameters={}) def test_fancy_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) order = "C" n = get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n }", [ "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape="(n, n)", order=order), lp.ArrayArg("b", dtype, shape="(n, n)", order=order), lp.ArrayArg("c", dtype, shape="(n, n)", order=order), lp.ScalarArg("n", np.int32, approximately=1000), ], name="fancy_matmul", assumptions="n>=1") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 16, slabs=(0,1)) knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, ran_factor=0) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, ran_factor=0) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(n), lsize(n), a.data, b.data, c.data, n, g_times_l=True) if check: check_error(refsol, c.get()) return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) if __name__ == "__main__": import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: from py.test.cmdline import main main([__file__])