From 125a5286401901eb22ea729d1971fa2754f5ad4b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 24 Aug 2011 14:23:40 +0200 Subject: [PATCH] Make axpy work. --- loopy/__init__.py | 19 +++++++++---- loopy/codegen/loop_dim.py | 7 ++++- loopy/codegen/prefetch.py | 8 +++--- test/test_matmul.py | 59 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 11 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 64f426dc9..390f9996e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -10,9 +10,14 @@ register_mpz_with_pymbolic() +# Immediately: +# ------------ +# TODO: Imitate codegen bulk slab handling in bulk slab trials +# For writeup: +# ------------ # TODO: Try, fix reg. prefetch (DG example) / CSEs -# ILP and reg. prefetch (might) interact! +# ILP and reg. prefetch interact! # TODO: Custom reductions per red. axis # TODO: Functions # TODO: Common subexpressions @@ -21,16 +26,20 @@ register_mpz_with_pymbolic() # FIXME: write names should be assigned during scheduling # TODO: Divisibility -# TODO: Try different kernels -# TODO: - Tricky: Convolution, FD # TODO: Try, fix indirect addressing # TODO: More user control for slab opt -# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?) +# TODO: Implement GT200 matmul, Fermi matmul, DG +# TODO: DMA engine threads? + +# Later: +# ------ +# TODO: Try different kernels +# TODO: - Tricky: Convolution, FD +# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?) # TODO: implement efficient ceil_div? (as opposed to floor_div) # TODO: why are corner cases inefficient? # TODO: Use gists (why do disjoint sets arise?) -# TODO: Imitate codegen bulk slab handling in bulk slab trials diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py index 872aeca77..7c799a84c 100644 --- a/loopy/codegen/loop_dim.py +++ b/loopy/codegen/loop_dim.py @@ -141,7 +141,12 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): assert lower_kind == ">=" assert upper_kind == "<" - success, length = kernel.domain.project_out_except([iname], [dim_type.set]).count() + proj_domain = (kernel.domain + .project_out_except([iname], [dim_type.set]) + .project_out_except([], [dim_type.param]) + .remove_divs()) + assert proj_domain.is_bounded() + success, length = proj_domain.count() assert success == 0 def generate_idx_eq_slabs(): diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index e7d880cf2..22af66d33 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -413,11 +413,11 @@ def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): iname_lwr, iname_upr = pf.dim_bounds_by_iname[iname] new_block.append(Comment(" %s [%d..%d)" % (iname, iname_lwr, iname_upr))) new_block.append(Comment(" using:")) - for realiz_iname in realiz_inames: - if realiz_iname is None: - new_block.append(Comment(" loop")) - else: + if realiz_inames is None: + new_block.append(Comment(" loop")) + else: + for realiz_iname in realiz_inames: rd_iname_descr = "loop" iname_lwr, iname_upr, iname_eq = flnd.kernel.get_bounds(realiz_iname, (realiz_iname,), allow_parameters=False) diff --git a/test/test_matmul.py b/test/test_matmul.py index 63d82d7ff..1c6210f0d 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -79,6 +79,63 @@ def get_suitable_size(ctx): +def test_axpy(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = get_suitable_size(ctx) + from pymbolic import var + x, y, z, n_sym, i = [var(s) for s in "xyzni"] + + n_approx = 10**6 + + knl = lp.LoopKernel(ctx.devices[0], + "[n] -> {[i]: 0<=i