diff --git a/loopy/__init__.py b/loopy/__init__.py index 6b0a1dcd41ed50f1e4ccdc7f3a39685ce4a50c50..03d0ddb507aa7c130c7e823ce35a18cf8b55d5d9 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -21,10 +21,12 @@ register_mpz_with_pymbolic() # TODO: Custom reductions per red. axis # TODO: Functions # TODO: Common subexpressions -# TODO: Array common subexpressions +# TODO: Array common subexpressions (shared and private!) +# TODO: ILP arrays # FIXME: support non-reductive dimensions (what did I mean here?) # FIXME: write names should be assigned during scheduling # FIXME: screwy lower bounds in ILP +# FIXME: Leading syncthreads elimination # TODO: Divisibility # TODO: Try, fix indirect addressing diff --git a/loopy/kernel.py b/loopy/kernel.py index 30aa1c515addeb4b01cf075a0a19f2efc67b76a3..2838d09d00246811d6889ee7d5615ce66df0a920 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -433,7 +433,13 @@ class LoopKernel(Record): def split_dimension(self, name, inner_length, padded_length=None, outer_name=None, inner_name=None, outer_tag=None, inner_tag=None, - outer_slab_increments=(0, -1)): + outer_slab_increments=(0, -1), no_slabs=None): + + if name not in self.all_inames(): + raise ValueError("cannot split loop for unknown variable '%s'" % name) + + if no_slabs: + outer_slab_increments = (0, 0) outer_tag = parse_tag(outer_tag) inner_tag = parse_tag(inner_tag) diff --git a/loopy/schedule.py b/loopy/schedule.py index a400f077fa33f6e21ef889c1a3bc91e05f2653d7..ee5cab82a023fb2a7f8b1b524c19d63193564ffc 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -112,7 +112,7 @@ def generate_loop_schedules(kernel, hints=[]): hints = hints[1:] if hints and hints[0] in schedulable: - schedulable = set(hints[0]) + schedulable = set([hints[0]]) if schedulable: # have a schedulable variable? schedule a loop for it, recurse diff --git a/test/test_matmul.py b/test/test_matmul.py index e688e28382e021540b0dcb89ec8c19184d14f425..d7b37e73690f3e4718beb69837545cc3ba810a75 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -159,9 +159,11 @@ def test_plain_matrix_mul(ctx_factory): ], name="matmul") - knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 16) + knl = lp.split_dimension(knl, "i", 16, + outer_tag="g.0", inner_tag="l.1", no_slabs=True) + knl = lp.split_dimension(knl, "j", 16, + outer_tag="g.1", inner_tag="l.0", no_slabs=True) + knl = lp.split_dimension(knl, "k", 16, no_slabs=True) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) assert knl.get_problems({})[0] <= 2 @@ -189,6 +191,183 @@ def test_plain_matrix_mul(ctx_factory): +def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = 6*16*2 + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + "c[i, j] = a[i, k]*b[k, j]" + ], + [ + lp.ArrayArg("a", dtype, shape=(n, n), order=order), + lp.ArrayArg("b", dtype, shape=(n, n), order=order), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + i_reg = 2 + j_reg = 2 + i_chunks = 16 + j_chunks = 16 + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "k", 16, no_slabs=True) + knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner"]) + assert knl.get_problems({})[0] <= 2 + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl)) + + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + c = cl_array.empty_like(a) + refsol = np.dot(a.get(), b.get()) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + + + + +def test_intel_matrix_mul(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = 6*16*16 + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + "c[i, j] = a[i, k]*b[k, j]" + ], + [ + lp.ArrayArg("a", dtype, shape=(n, n), order=order), + lp.ArrayArg("b", dtype, shape=(n, n), order=order), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + i_reg = 4 + j_reg = 4 + i_chunks = 16 + j_chunks = 16 + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "k", 16, no_slabs=True) + #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") + knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) + knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + assert knl.get_problems({})[0] <= 2 + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl, + hints=["k_outer", "k_inner_outer", "k_inner_inner"] + )) + + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + c = cl_array.empty_like(a) + refsol = np.dot(a.get(), b.get()) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + + + + + +def test_magma_fermi_matrix_mul(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = 6*16*16 + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + "c[i, j] = a[i, k]*b[k, j]" + ], + [ + lp.ImageArg("a", dtype, 2), + lp.ImageArg("b", dtype, 2), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + i_reg = 4 + j_reg = 4 + i_chunks = 16 + j_chunks = 16 + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) + knl = lp.split_dimension(knl, "k", 16, no_slabs=True) + #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") + knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) + knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + assert knl.get_problems({})[0] <= 2 + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl, + hints=["k_outer", "k_inner_outer", "k_inner_inner"] + )) + + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + a_img = cl.image_from_array(ctx, a.get(), 1) + b_img = cl.image_from_array(ctx, b.get(), 1) + c = cl_array.empty_like(a) + refsol = np.dot(a.get(), b.get()) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, + g_times_l=True) + + if check: + check_error(refsol, c.get()) + + return evt + + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + + + + + def test_image_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory()