From 55c79f2861eec25196e9b0d4f06c4575cdb8b8cd Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 21 Jul 2011 12:02:49 -0500
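Subject: [PATCH] Add a way to pass in CL build options.

CompiledKernel and drive_timing_run now accept an `options` argument that
is forwarded to cl.Program(...).build(). The matrix-ops examples pass a
shared FAST_OPTIONS list through it.

Also in this change: image_matrix_mul reads both `a` and `b` as images and
splits `k` by 16 instead of 4, and image reads in LoopyCCodeMapper emit
their index components in reversed order.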

---
 examples/matrix-ops.py | 20 ++++++++++++++------
 loopy/__init__.py      | 15 +++++++++------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index 5bebaefc2..a8cd27539 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -7,6 +7,9 @@ import loopy as lp
 
 
 
+FAST_OPTIONS = ["-cl-mad-enable", "-cl-fast-relaxed-math", 
+        "-cl-no-signed-zeros", "-cl-strict-aliasing"]
+
 def make_well_conditioned_dev_matrix(queue, n, dtype=np.float32, order="C"):
     return cl_array.to_device(queue,
             np.asarray(np.random.randn(n, n) + 5*np.eye(n),
@@ -74,7 +77,8 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context):
 
         return evt
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
+            options=FAST_OPTIONS)
 
 
 
@@ -97,14 +101,14 @@ def image_matrix_mul(ctx_factory=cl.create_some_context):
                 ],
             [
                 lp.ImageArg("a", dtype, 2),
-                lp.ArrayArg("b", dtype, shape=(n, n), order=order),
+                lp.ImageArg("b", dtype, 2),
                 lp.ArrayArg("c", dtype, shape=(n, n), order=order),
                 ],
             name="matmul")
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 4)
+    knl = lp.split_dimension(knl, "k", 16)
     knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ])
     assert knl.get_invalid_reason() is None
@@ -116,9 +120,11 @@ def image_matrix_mul(ctx_factory=cl.create_some_context):
     b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
     c = cl_array.empty_like(a)
     refsol = np.dot(a.get(), b.get())
+    a_img = cl.image_from_array(ctx, a.get(), 1)
+    b_img = cl.image_from_array(ctx, b.get(), 1)
 
     def launcher(kernel, gsize, lsize, check):
-        evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data,
+        evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data,
                 g_times_l=True)
 
         if check:
@@ -126,7 +132,8 @@ def image_matrix_mul(ctx_factory=cl.create_some_context):
 
         return evt
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
+            options=FAST_OPTIONS)
 
 
 
@@ -180,7 +187,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context):
 
         return evt
 
-    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3)
+    lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3,
+            options=FAST_OPTIONS)
 
 
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 48dd2ac41..5dab8596f 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -23,9 +23,11 @@ register_mpz_with_pymbolic()
 
 
 # TODO: Wrong 19
+# TODO: Restrict on/off
 # TODO: Try, fix reg. prefetch
+# TODO: 1D local arrays
+# TODO: doubles in textures? as_double
 # TODO: Divisibility
-# TODO: nD Texture access
 # TODO: Functions
 # TODO: Common subexpressions
 # TODO: Try different kernels
@@ -1163,7 +1165,7 @@ class LoopyCCodeMapper(CCodeMapper):
                 return ("read_imagef(%s, loopy_sampler, (float%d)(%s)).x"
                         % (arg.name, arg.dimensions, 
                             ", ".join(self.rec(idx, PREC_NONE) 
-                                for idx in expr.index)))
+                                for idx in expr.index[::-1])))
             else:
                 # ArrayArg
                 index_expr = expr.index
@@ -2066,7 +2068,7 @@ def add_prefetch(kernel, input_access_descr, tags_or_inames, loc_fetch_axes={}):
 
 
 class CompiledKernel:
-    def __init__(self, context, kernel, size_args=None):
+    def __init__(self, context, kernel, size_args=None, options=[]):
         self.kernel = kernel
         self.code = generate_code(kernel)
 
@@ -2074,7 +2076,7 @@ class CompiledKernel:
         #self.code = invoke_editor(self.code)
 
         self.cl_kernel = getattr(
-                cl.Program(context, self.code).build(),
+                cl.Program(context, self.code).build(options=options),
                 kernel.name)
 
         arg_types = []
@@ -2115,7 +2117,8 @@ class CompiledKernel:
 
 
 # driver ----------------------------------------------------------------------
-def drive_timing_run(kernel_generator, queue, launch, flop_count=None):
+def drive_timing_run(kernel_generator, queue, launch, flop_count=None,
+        options=[]):
 
     def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5):
         check = True
@@ -2139,7 +2142,7 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None):
     soln_count = 0
     for kernel in kernel_generator:
 
-        compiled = CompiledKernel(queue.context, kernel)
+        compiled = CompiledKernel(queue.context, kernel, options=options)
 
         print "-----------------------------------------------"
         print "SOLUTION #%d" % soln_count
-- 
GitLab