diff --git a/loopy/__init__.py b/loopy/__init__.py index 64f426dc9a54736811b4f3ed18eb2a565fa36efe..390f9996e36e4b4195eb167e30e430512c6fc623 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -10,9 +10,14 @@ register_mpz_with_pymbolic() +# Immediately: +# ------------ +# TODO: Imitate codegen bulk slab handling in bulk slab trials +# For writeup: +# ------------ # TODO: Try, fix reg. prefetch (DG example) / CSEs -# ILP and reg. prefetch (might) interact! +# ILP and reg. prefetch interact! # TODO: Custom reductions per red. axis # TODO: Functions # TODO: Common subexpressions @@ -21,16 +26,20 @@ register_mpz_with_pymbolic() # FIXME: write names should be assigned during scheduling # TODO: Divisibility -# TODO: Try different kernels -# TODO: - Tricky: Convolution, FD # TODO: Try, fix indirect addressing # TODO: More user control for slab opt -# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?) +# TODO: Implement GT200 matmul, Fermi matmul, DG +# TODO: DMA engine threads? + +# Later: +# ------ +# TODO: Try different kernels +# TODO: - Tricky: Convolution, FD +# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?) # TODO: implement efficient ceil_div? (as opposed to floor_div) # TODO: why are corner cases inefficient? # TODO: Use gists (why do disjoint sets arise?) 
-# TODO: Imitate codegen bulk slab handling in bulk slab trials diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py index 872aeca776c64953788099a61137c81953c1a5b0..7c799a84c3b5026ac12a4f5d6e4be5353aa1cf65 100644 --- a/loopy/codegen/loop_dim.py +++ b/loopy/codegen/loop_dim.py @@ -141,7 +141,12 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): assert lower_kind == ">=" assert upper_kind == "<" - success, length = kernel.domain.project_out_except([iname], [dim_type.set]).count() + proj_domain = (kernel.domain + .project_out_except([iname], [dim_type.set]) + .project_out_except([], [dim_type.param]) + .remove_divs()) + assert proj_domain.is_bounded() + success, length = proj_domain.count() assert success == 0 def generate_idx_eq_slabs(): diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index e7d880cf287da5b8da5820d73cb578e61140f648..22af66d33b52e22d4d877440a60eb3af2619da59 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -413,11 +413,11 @@ def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): iname_lwr, iname_upr = pf.dim_bounds_by_iname[iname] new_block.append(Comment(" %s [%d..%d)" % (iname, iname_lwr, iname_upr))) new_block.append(Comment(" using:")) - for realiz_iname in realiz_inames: - if realiz_iname is None: - new_block.append(Comment(" loop")) - else: + if realiz_inames is None: + new_block.append(Comment(" loop")) + else: + for realiz_iname in realiz_inames: rd_iname_descr = "loop" iname_lwr, iname_upr, iname_eq = flnd.kernel.get_bounds(realiz_iname, (realiz_iname,), allow_parameters=False) diff --git a/test/test_matmul.py b/test/test_matmul.py index 63d82d7ff462e3c28cb7a314f2b79a4ea311ddeb..1c6210f0d9c5c5e10636ae718ae04707c474693b 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -79,6 +79,63 @@ def get_suitable_size(ctx): +def test_axpy(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + 
properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = get_suitable_size(ctx) + from pymbolic import var + x, y, z, n_sym, i = [var(s) for s in "xyzni"] + + n_approx = 10**6 + + knl = lp.LoopKernel(ctx.devices[0], + "[n] -> {[i]: 0<=i<n}", + [ + (z[i], x[i]+y[i]) # FIXME: Add scalars + ], + [ + lp.ArrayArg("x", dtype, shape=(n,)), + lp.ArrayArg("y", dtype, shape=(n,)), + lp.ArrayArg("z", dtype, shape=(n,)), + lp.ScalarArg("n", np.int32, approximately=n_approx), + ], + name="axpy") + + unroll = 16 + block_size = 256 + knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0") + knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0") + assert knl.get_problems({"n": n_approx})[0] <= 2 + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl)) + + #a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + #b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + #c = cl_array.empty_like(a) + #refsol = np.dot(a.get(), b.get()) + + def launcher(kernel, gsize, lsize, check): + #evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, + #g_times_l=True) + + #if check: + #check_error(refsol, c.get()) + + #return evt + 1/0 + + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, + edit_code=True) + + + + + def test_plain_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() @@ -104,7 +161,7 @@ def test_plain_matrix_mul(ctx_factory): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 4) + knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) assert knl.get_problems({})[0] <= 2