From 55c79f2861eec25196e9b0d4f06c4575cdb8b8cd Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 21 Jul 2011 12:02:49 -0500 Subject: [PATCH] Make a way to pass in CL build options. --- examples/matrix-ops.py | 20 ++++++++++++++------ loopy/__init__.py | 15 +++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index 5bebaefc2..a8cd27539 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -7,6 +7,9 @@ import loopy as lp +FAST_OPTIONS = ["-cl-mad-enable", "-cl-fast-relaxed-math", + "-cl-no-signed-zeros", "-cl-strict-aliasing"] + def make_well_conditioned_dev_matrix(queue, n, dtype=np.float32, order="C"): return cl_array.to_device(queue, np.asarray(np.random.randn(n, n) + 5*np.eye(n), @@ -74,7 +77,8 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): return evt - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, + options=FAST_OPTIONS) @@ -97,14 +101,14 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): ], [ lp.ImageArg("a", dtype, 2), - lp.ArrayArg("b", dtype, shape=(n, n), order=order), + lp.ImageArg("b", dtype, 2), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 4) + knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) assert knl.get_invalid_reason() is None @@ -116,9 +120,11 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) + a_img = cl.image_from_array(ctx, a.get(), 1) + b_img = cl.image_from_array(ctx, b.get(), 1) def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, + evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, g_times_l=True) if check: @@ -126,7 +132,8 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): return evt - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, + options=FAST_OPTIONS) @@ -180,7 +187,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): return evt - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, + options=FAST_OPTIONS) diff --git a/loopy/__init__.py b/loopy/__init__.py index 48dd2ac41..5dab8596f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -23,9 +23,11 @@ register_mpz_with_pymbolic() # TODO: Wrong 19 +# TODO: Restrict on/off # TODO: Try, fix reg. prefetch +# TODO: 1D local arrays +# TODO: doubles in textures? as_double # TODO: Divisibility -# TODO: nD Texture access # TODO: Functions # TODO: Common subexpressions # TODO: Try different kernels @@ -1163,7 +1165,7 @@ class LoopyCCodeMapper(CCodeMapper): return ("read_imagef(%s, loopy_sampler, (float%d)(%s)).x" % (arg.name, arg.dimensions, ", ".join(self.rec(idx, PREC_NONE) - for idx in expr.index))) + for idx in expr.index[::-1]))) else: # ArrayArg index_expr = expr.index @@ -2066,7 +2068,7 @@ def add_prefetch(kernel, input_access_descr, tags_or_inames, loc_fetch_axes={}): class CompiledKernel: - def __init__(self, context, kernel, size_args=None): + def __init__(self, context, kernel, size_args=None, options=[]): self.kernel = kernel self.code = generate_code(kernel) @@ -2074,7 +2076,7 @@ class CompiledKernel: #self.code = invoke_editor(self.code) self.cl_kernel = getattr( - cl.Program(context, self.code).build(), + cl.Program(context, self.code).build(options=options), kernel.name) arg_types = [] @@ -2115,7 +2117,8 @@ class CompiledKernel: # driver ---------------------------------------------------------------------- -def drive_timing_run(kernel_generator, queue, launch, flop_count=None): +def drive_timing_run(kernel_generator, queue, launch, flop_count=None, + options=[]): def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5): check = True @@ -2139,7 +2142,7 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None): soln_count = 0 for kernel in kernel_generator: - compiled = CompiledKernel(queue.context, kernel) + compiled = CompiledKernel(queue.context, kernel, options=options) print "-----------------------------------------------" print "SOLUTION #%d" % soln_count -- GitLab