From 321e5440c7ac80d2a8ddb3f404e174a28edf572d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Wed, 20 Jul 2011 23:39:23 -0500 Subject: [PATCH] Only use restrict pointers on non-NV platforms, plus minor tweaks. --- examples/matrix-ops.py | 39 ++++++++++++++++++++++++--------------- loopy/__init__.py | 19 ++++++++++++++++--- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index 81da59980..ac383546b 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -7,9 +7,10 @@ import loopy as lp -def make_well_condition_dev_matrix(queue, n, dtype=np.float32): +def make_well_condition_dev_matrix(queue, n, dtype=np.float32, order="C"): return cl_array.to_device(queue, - np.random.randn(n, n).astype(dtype) + 5*np.eye(n, dtype=dtype)) + np.asarray(np.random.randn(n, n) + 5*np.eye(n), + dtype=dtype, order=order)) @@ -17,10 +18,11 @@ def make_well_condition_dev_matrix(queue, n, dtype=np.float32): def plain_matrix_mul(ctx_factory=cl.create_some_context): dtype = np.float32 ctx = ctx_factory() + order = "C" queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = 16*10 + n = 16*100 from pymbolic import var a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] @@ -30,24 +32,24 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): (c[i, j], a[i, k]*b[k, j]) ], [ - lp.ArrayArg("a", dtype, shape=(n, n)), - lp.ArrayArg("b", dtype, shape=(n, n)), - lp.ArrayArg("c", dtype, shape=(n, n)), + lp.ArrayArg("a", dtype, shape=(n, n), order=order), + lp.ArrayArg("b", dtype, shape=(n, n), order=order), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 16) - knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) + knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) assert knl.get_invalid_reason() is None kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_condition_dev_matrix(queue, n, dtype=dtype) - b = make_well_condition_dev_matrix(queue, n, dtype=dtype) + a = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) @@ -58,7 +60,12 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context): if check: sol = c.get() rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") - assert rel_err < 1e-5, rel_err + if rel_err > 1e-5: + import matplotlib.pyplot as pt + pt.imshow(refsol-sol) + pt.colorbar() + pt.show() + raise RuntimeError("check failed") return evt @@ -73,6 +80,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + order = "F" + n = 16*30 from pymbolic import var a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] @@ -99,8 +108,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_condition_dev_matrix(queue, n, dtype=dtype) - b = make_well_condition_dev_matrix(queue, n, dtype=dtype) + a = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) + b = make_well_condition_dev_matrix(queue, n, dtype=dtype, order=order) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) @@ -110,9 +119,9 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): if check: sol = c.get() - import matplotlib.pyplot as pt rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") if rel_err > 1e-5: + import matplotlib.pyplot as pt pt.imshow(refsol-sol) pt.colorbar() pt.show() @@ -208,4 +217,4 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - fancy_matrix_mul() + plain_matrix_mul() diff --git a/loopy/__init__.py b/loopy/__init__.py index d7cb5a639..563bace7c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1798,8 +1798,8 @@ class CodeGenerationState(Record): __slots__ = ["c_code_mapper", "try_slab_partition"] def generate_code(kernel): - from cgen import (FunctionBody, FunctionDeclaration, \ - POD, Value, RestrictPointer, ArrayOf, Module, Block, + from cgen import (FunctionBody, FunctionDeclaration, + POD, Value, ArrayOf, Module, Block, Define, Line, Const, LiteralLines) from cgen.opencl import CLKernel, CLGlobal, CLRequiredWorkGroupSize, CLLocal @@ -1850,10 +1850,19 @@ def generate_code(kernel): has_double = False + def restrict_ptr_if_not_nvidia(arg): + from cgen import Pointer, RestrictPointer + + if "nvidia" in kernel.device.platform.name.lower(): + return Pointer(arg) + else: + return RestrictPointer(arg) + args = [] for arg in kernel.args: if isinstance(arg, ArrayArg): - arg_decl = RestrictPointer(POD(arg.dtype, arg.name)) + arg_decl = restrict_ptr_if_not_nvidia( + POD(arg.dtype, arg.name)) if arg_decl.name in kernel.input_vectors(): arg_decl = Const(arg_decl) arg_decl = CLGlobal(arg_decl) @@ -2016,6 +2025,10 @@ class CompiledKernel: def __init__(self, context, kernel, size_args=None): self.kernel = kernel self.code = generate_code(kernel) + + #from pytools import invoke_editor + #self.code = invoke_editor(self.code) + self.cl_kernel = getattr( cl.Program(context, self.code).build(), kernel.name) -- GitLab