diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index e76acc2e8c0a6ad9c90ff13460726c50ead5c0dc..e497928e65c2e86bf40b2c85aefc35d29506903e 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -10,12 +10,17 @@ import loopy as lp FAST_OPTIONS = ["-cl-mad-enable", "-cl-fast-relaxed-math", "-cl-no-signed-zeros", "-cl-strict-aliasing"] -def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, order="C", ran_factor=1, od=0): +def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, + order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): if isinstance(shape, int): shape = (shape, shape) + l = max(shape) + eye_ish = id_factor*np.eye(l, k=od) + if inc_factor: + eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) ary = np.asarray( ran_factor*np.random.randn(*shape) - + 5*np.eye(max(shape), k=od)[:shape[0], :shape[1]], + + eye_ish[:shape[0], :shape[1]], dtype=dtype, order=order) return cl_array.to_device(queue, ary) @@ -27,9 +32,12 @@ DO_CHECK = True DEBUG_PREAMBLE = r""" #pragma OPENCL EXTENSION cl_amd_printf: enable - #define IFDIAG if (i_outer*16+i_inner == j_outer*16+j_inner) - #define TST(S) IFDIAG if (i_outer*16+i_inner == 0) \ - printf("ko=%d ki=%d" #S "\n", k_outer, k_inner); + #define MY_J (j_outer*64+j_inner_outer*16+j_inner_inner) + #define MY_I (i_outer*16+i_inner) + #define IFDIAG if (MY_I == MY_J) + #define TST(S) if (MY_J == 144 && MY_I == 16-48) \ + for (int aa = 0; aa < 16: ++ab) \ + for (int bb = 0; bb < 16: ++bb) """ @@ -38,20 +46,21 @@ DEBUG_PREAMBLE = r""" def check_error(refsol, sol): rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") if DO_CHECK and rel_err > 1e-5: - if 1: + if 0: import matplotlib.pyplot as pt pt.imshow(refsol-sol) pt.colorbar() pt.show() - elif 0: + elif 1: print "---------------------------" print "ACTUAL" print "---------------------------" - print sol + np.set_printoptions(threshold=1000000, linewidth=200) + print sol[:,-16:] print "---------------------------" print "CORRECT" print "---------------------------" - print refsol + print refsol[-16:,-16:] raise RuntimeError("check failed, rel err=%g" % rel_err) @@ -168,101 +177,6 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): force_rebuild=True) -def dg_matrix_mul(ctx_factory=cl.create_some_context): - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - Np = 84 - Np_padded = 96 - K = 20000 - dim = 3 - num_flds = 2 - - from pymbolic import var - fld = var("fld") - matrix_names = ["d%d" % i for i in range(dim)] - i, j, k = [var(s) for s in "i j k".split()] - - fld_strides = (1, Np_padded) - - knl = lp.LoopKernel(ctx.devices[0], - "{[i,j,k]: 0<=i,j< %d and 0<=k<%d}" % (Np, K), - [ - (var(mn+"fld%d" % ifld)[i, k], - var(mn)[i, j]*var("fld%d" % ifld)[j, k]) - for mn in matrix_names - for ifld in range(num_flds) - ], - [lp.ImageArg(mn, dtype, 2) for mn in matrix_names] - + [lp.ArrayArg("fld%d" % ifld, dtype, - strides=fld_strides) - for ifld in range(num_flds) - ] - + [lp.ArrayArg(mn+"fld%d" % ifld, dtype, - strides=fld_strides) - for ifld in range(num_flds) - for mn in matrix_names - ], - name="dg_matmul") - - ilp = 4 - knl = lp.split_dimension(knl, "i", 30, 32, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 16*ilp, outer_tag="g.1") - knl = lp.split_dimension(knl, "k_inner", 16, outer_tag="ilp", inner_tag="l.1") - - assert Np % 2 == 0 - #knl = lp.split_dimension(knl, "j", Np//2) - #knl = lp.split_dimension(knl, "k", 32) - - #for mn in matrix_names: - #knl = lp.add_prefetch(knl, mn, ["j", "i_inner"]) - for ifld in range(num_flds): - knl = lp.add_prefetch(knl, 'fld%d' % ifld, - ["k_inner_outer", "k_inner_inner", "j"]) - assert knl.get_invalid_reason() is None - - kernel_gen = list(lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl))[:1] - - matrices = [ - make_well_conditioned_dev_matrix(queue, Np, dtype=dtype, order="C") - for mn in matrix_names] - flds = [ - make_well_conditioned_dev_matrix(queue, (Np_padded, K), dtype=dtype, order="F") - for ifld in range(num_flds)] - outputs = [cl_array.empty_like(flds[0]) - for ifld in range(num_flds) - for mn in matrix_names] - - ref_soln = [np.dot(mat.get(), fld.get()[:Np]) - for fld in flds - for mat in matrices] - - mat_images = [ - cl.image_from_array(ctx, mat.get(), 1) for mat in matrices] - - def launcher(kernel, gsize, lsize, check): - args = mat_images + [fld.data for fld in flds] + [out.data for out in outputs] - kwargs = dict(g_times_l=True) - evt = kernel(queue, gsize(), lsize(), *args, g_times_l=True) - - if check: - for out, ref in zip(outputs, ref_soln): - check_error(ref, out.get()[:Np]) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, num_flds*dim*2*(Np**2)*K, - options=FAST_OPTIONS + ["-cl-nv-verbose"], - force_rebuild=True, #, edit=True - print_code=False - ) - - - def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): @@ -272,7 +186,7 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = 16*100 + n = 16*10 from pymbolic import var a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] @@ -288,24 +202,28 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): #lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], - name="matmul") + name="matmul", preamble=DEBUG_PREAMBLE) - ilp = 8 + ilp = 4 knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", ilp*16, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", 16, outer_tag="ilp", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) # conflict-free knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) - assert knl.get_invalid_reason() is None + #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) + inv_reason = knl.get_invalid_reason() + assert inv_reason is None, inv_reason kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=0, id_factor=1) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=0, id_factor=0, inc_factor=1000) c = cl_array.empty_like(a) + c.fill(-17) refsol = np.dot(a.get(), b.get()) a_img = cl.image_from_array(ctx, a.get(), 1) b_img = cl.image_from_array(ctx, b.get(), 1) @@ -321,7 +239,7 @@ def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, options=FAST_OPTIONS,# + ["-cl-nv-verbose"], - force_rebuild=True, edit_code=False) + force_rebuild=True, edit_code=True) @@ -386,6 +304,103 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): +def dg_matrix_mul(ctx_factory=cl.create_some_context): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + Np = 84 + Np_padded = 96 + K = 20000 + dim = 3 + num_flds = 2 + + from pymbolic import var + fld = var("fld") + matrix_names = ["d%d" % i for i in range(dim)] + i, j, k = [var(s) for s in "i j k".split()] + + fld_strides = (1, Np_padded) + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j< %d and 0<=k<%d}" % (Np, K), + [ + (var(mn+"fld%d" % ifld)[i, k], + var(mn)[i, j]*var("fld%d" % ifld)[j, k]) + for mn in matrix_names + for ifld in range(num_flds) + ], + [lp.ImageArg(mn, dtype, 2) for mn in matrix_names] + + [lp.ArrayArg("fld%d" % ifld, dtype, + strides=fld_strides) + for ifld in range(num_flds) + ] + + [lp.ArrayArg(mn+"fld%d" % ifld, dtype, + strides=fld_strides) + for ifld in range(num_flds) + for mn in matrix_names + ], + name="dg_matmul") + + ilp = 4 + knl = lp.split_dimension(knl, "i", 30, 32, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 16*ilp, outer_tag="g.1") + knl = lp.split_dimension(knl, "k_inner", 16, outer_tag="ilp", inner_tag="l.1") + + assert Np % 2 == 0 + #knl = lp.split_dimension(knl, "j", Np//2) + #knl = lp.split_dimension(knl, "k", 32) + + #for mn in matrix_names: + #knl = lp.add_prefetch(knl, mn, ["j", "i_inner"]) + for ifld in range(num_flds): + knl = lp.add_prefetch(knl, 'fld%d' % ifld, + ["k_inner_outer", "k_inner_inner", "j"]) + assert knl.get_invalid_reason() is None + + kernel_gen = list(lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl))[:1] + + matrices = [ + make_well_conditioned_dev_matrix(queue, Np, dtype=dtype, order="C") + for mn in matrix_names] + flds = [ + make_well_conditioned_dev_matrix(queue, (Np_padded, K), dtype=dtype, order="F") + for ifld in range(num_flds)] + outputs = [cl_array.empty_like(flds[0]) + for ifld in range(num_flds) + for mn in matrix_names] + + ref_soln = [np.dot(mat.get(), fld.get()[:Np]) + for fld in flds + for mat in matrices] + + mat_images = [ + cl.image_from_array(ctx, mat.get(), 1) for mat in matrices] + + def launcher(kernel, gsize, lsize, check): + args = mat_images + [fld.data for fld in flds] + [out.data for out in outputs] + kwargs = dict(g_times_l=True) + evt = kernel(queue, gsize(), lsize(), *args, g_times_l=True) + + if check: + for out, ref in zip(outputs, ref_soln): + check_error(ref, out.get()[:Np]) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, num_flds*dim*2*(Np**2)*K, + options=FAST_OPTIONS + ["-cl-nv-verbose"], + force_rebuild=True, #, edit=True + print_code=False + ) + + + + + def main_elwise_scaled_matrix_mul(): Np = 64 K = 2000 @@ -469,4 +484,4 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - plain_matrix_mul() + image_matrix_mul_ilp() diff --git a/loopy/__init__.py b/loopy/__init__.py index 570bdd089b2e29963d9ab2e49dad9f981b707050..f4d47e2c7b997ea7870651e9f808659f4be692c4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,6 +22,7 @@ register_mpz_with_pymbolic() +# TODO: Constant memory # TODO: Reuse of previously split dimensions for prefetch # (Or general merging) # TODO: ILP Unroll @@ -31,6 +32,8 @@ register_mpz_with_pymbolic() # ILP must be outside of reduction loops # Therfore, there are prefetches inside ILPs # TODO: Debug 1 ILP +# FIXME: Random equality constraints +# TODO: Use increment for ILP? # TODO: Try, fix reg. prefetch (DG example) / CSEs # TODO: Custom reductions per red. axis