diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index e8618098c299a16dee4cc8b063527f2e3d80ea02..5bebaefc294aebc6e40ab6abc77d0aecc7e6a558 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -7,7 +7,7 @@ import loopy as lp -def make_well_condition_dev_matrix(queue, n, dtype=np.float32, order="C"): +def make_well_conditioned_dev_matrix(queue, n, dtype=np.float32, order="C"): return cl_array.to_device(queue, np.asarray(np.random.randn(n, n) + 5*np.eye(n), dtype=dtype, order=order)) @@ -15,6 +15,18 @@ def make_well_condition_dev_matrix(queue, n, dtype=np.float32, order="C"): +def check_error(refsol, sol): + rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") + if rel_err > 1e-5: + import matplotlib.pyplot as pt + pt.imshow(refsol-sol) + pt.colorbar() + pt.show() + raise RuntimeError("check failed, rel err=%g" % rel_err) + + + + def plain_matrix_mul(ctx_factory=cl.create_some_context): dtype = np.float32 ctx = ctx_factory() @@ -48,8 +60,8 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) @@ -58,14 +70,7 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): g_times_l=True) if check: - sol = c.get() - rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") - if rel_err > 1e-5: - import matplotlib.pyplot as pt - pt.imshow(refsol-sol) - pt.colorbar() - pt.show() - raise RuntimeError("check failed") + check_error(refsol, c.get()) return evt @@ -74,6 +79,59 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): +def image_matrix_mul(ctx_factory=cl.create_some_context): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = 16*100 + from pymbolic import var + a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + (c[i, j], a[i, k]*b[k, j]) + ], + [ + lp.ImageArg("a", dtype, 2), + lp.ArrayArg("b", dtype, shape=(n, n), order=order), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") + knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 4) + knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) + assert knl.get_invalid_reason() is None + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl)) + + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + c = cl_array.empty_like(a) + refsol = np.dot(a.get(), b.get()) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + + + + + def fancy_matrix_mul(ctx_factory=cl.create_some_context): dtype = np.float32 ctx = ctx_factory() @@ -82,7 +140,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): order = "F" - n = 16*30 + n = 16*100 from pymbolic import var a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] @@ -100,7 +158,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 4, inner_tag="unr1") + knl = lp.split_dimension(knl, "k", 19) knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) assert knl.get_invalid_reason() is None @@ -108,8 +166,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) @@ -118,14 +176,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): g_times_l=True) if check: - sol = c.get() - rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") - if rel_err > 1e-5: - import matplotlib.pyplot as pt - pt.imshow(refsol-sol) - pt.colorbar() - pt.show() - raise RuntimeError("check failed") + check_error(refsol, c.get()) return evt @@ -217,4 +268,4 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - fancy_matrix_mul() + image_matrix_mul()