def print_highlighted_code(text):
    """Print *text* to stdout, syntax-highlighted as C when Pygments is usable.

    :arg text: source code to print, as a string.

    If any of the required Pygments pieces cannot be imported, *text* is
    printed unmodified instead.
    """
    try:
        # Import everything that is needed up front, so that a partial or
        # broken Pygments installation also ends up in the plain fallback.
        from pygments import highlight
        from pygments.lexers import CLexer
        from pygments.formatters import TerminalFormatter
    except ImportError:
        # Print the argument itself: no name 'compiled' exists in this scope.
        # (Single-argument, parenthesized print is valid in Python 2 and 3.)
        print(text)
    else:
        print(highlight(text, CLexer(), TerminalFormatter()))
# {{{ automatic testing

# Value written into every output array before a kernel runs, so that
# entries a kernel fails to write hold a known value instead of
# uninitialized memory.
_OUTPUT_FILL_VALUE = -17

_RULE = "----------------------------------------------------------"


def _to_scalar(dtype, value):
    """Return *value* converted to a numpy scalar of type *dtype*.

    *dtype* may be a scalar type such as ``np.int32`` or an ``np.dtype``
    instance. ``np.dtype`` instances are not callable, so the scalar type is
    looked up through ``np.dtype(dtype).type``, which works for both.
    """
    return np.dtype(dtype).type(value)


def _find_seq_device():
    """Pick the OpenCL device used for the sequential reference run.

    Returns the last CPU device found over all platforms. If there is no CPU
    device, a warning is issued and the last device of any type is returned.

    :raises RuntimeError: if no OpenCL device is found at all.
    """
    last_dev = None
    last_cpu_dev = None

    for pf in cl.get_platforms():
        for dev in pf.get_devices():
            last_dev = dev
            if dev.type == cl.device_type.CPU:
                last_cpu_dev = dev

    if last_cpu_dev is not None:
        return last_cpu_dev

    if last_dev is None:
        raise RuntimeError("no OpenCL device found for sequential test")

    from warnings import warn
    warn("No CPU device found for sequential test, using %s." % (last_dev,))
    return last_dev


def make_seq_args(kernel, queue, parameters):
    """Build the argument list for the sequential reference kernel.

    :arg kernel: kernel whose ``args`` are to be instantiated.
    :arg queue: command queue on which the arrays are allocated.
    :arg parameters: mapping from scalar argument names to values; it is
        also used to evaluate the array shapes.

    Arrays named in ``kernel.get_written_variables()`` are treated as outputs
    and filled with a sentinel value; all other arrays are treated as inputs
    and filled with random data.

    :returns: a tuple ``(args, input_arrays, output_arrays)``. *args* holds
        one entry per kernel argument (a numpy scalar or the ``.data`` of an
        array); the two array lists are in kernel argument order.
    :raises ValueError: if an array argument has no known shape.
    :raises RuntimeError: if an argument is neither a scalar nor an array.
    """
    from loopy.kernel import ScalarArg, ArrayArg
    from pymbolic import evaluate
    from pyopencl.clrandom import fill_rand

    # Loop-invariant, so query it once instead of once per argument.
    written_vars = kernel.get_written_variables()

    result = []
    input_arrays = []
    output_arrays = []

    for arg in kernel.args:
        if isinstance(arg, ScalarArg):
            result.append(_to_scalar(arg.dtype, parameters[arg.name]))

        elif isinstance(arg, ArrayArg):
            if arg.shape is None:
                raise ValueError("arrays need known shape to use automatic "
                        "testing")
            assert arg.offset == 0

            shape = evaluate(arg.shape, parameters)
            ary = cl_array.empty(queue, shape, arg.dtype, order=arg.order)
            if arg.name in written_vars:
                ary.fill(_OUTPUT_FILL_VALUE)
                output_arrays.append(ary)
            else:
                fill_rand(ary, luxury=2)
                input_arrays.append(ary)
            result.append(ary.data)

        else:
            raise RuntimeError("arg type not understood")

    return result, input_arrays, output_arrays


def make_args(queue, kernel, seq_input_arrays, parameters):
    """Build the argument list for a kernel under test.

    :arg queue: command queue on which the arrays are allocated.
    :arg kernel: kernel whose ``args`` are to be instantiated.
    :arg seq_input_arrays: input arrays of the sequential run, in kernel
        argument order. Their contents are copied to the device of *queue*.
        The list itself is not modified, so it can be reused for several
        kernels.
    :arg parameters: mapping from scalar argument names to values; it is
        also used to evaluate the array shapes.

    :returns: a tuple ``(args, output_arrays)``.
    :raises ValueError: if an output array argument has no known shape.
    :raises IndexError: if *seq_input_arrays* has fewer entries than the
        kernel has input array arguments.
    :raises RuntimeError: if an argument is neither a scalar nor an array.
    """
    from loopy.kernel import ScalarArg, ArrayArg
    from pymbolic import evaluate

    written_vars = kernel.get_written_variables()

    # Walk the inputs with an iterator rather than pop(0), which would
    # empty the caller's list as a side effect.
    seq_inputs = iter(seq_input_arrays)

    result = []
    output_arrays = []

    for arg in kernel.args:
        if isinstance(arg, ScalarArg):
            result.append(_to_scalar(arg.dtype, parameters[arg.name]))

        elif isinstance(arg, ArrayArg):
            if arg.name in written_vars:
                if arg.shape is None:
                    raise ValueError("arrays need known shape to use "
                            "automatic testing")
                assert arg.offset == 0

                shape = evaluate(arg.shape, parameters)
                ary = cl_array.empty(queue, shape, arg.dtype, order=arg.order)
                # Same sentinel as the sequential run, so the comparison
                # never depends on uninitialized memory.
                ary.fill(_OUTPUT_FILL_VALUE)
                output_arrays.append(ary)
            else:
                try:
                    seq_arg = next(seq_inputs)
                except StopIteration:
                    raise IndexError("not enough sequential input arrays "
                            "for the input arguments of the kernel")
                ary = cl_array.to_device(queue, seq_arg.get())

            result.append(ary.data)

        else:
            # Silently dropping an argument would only surface later as a
            # confusing failure at kernel invocation; match make_seq_args.
            raise RuntimeError("arg type not understood")

    return result, output_arrays


def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters,
        print_seq_code=False, print_code=True):
    """Check each kernel from *kernel_gen* against a sequential reference.

    *seq_knl* is scheduled, compiled and run once (on a CPU device if one is
    available). Every kernel yielded by *kernel_gen* is then compiled for
    *ctx*, run on copies of the same input data, and its output arrays are
    compared to the reference output with ``np.allclose``.

    :arg seq_knl: the reference kernel.
    :arg ctx: OpenCL context in which the kernels under test are run.
    :arg kernel_gen: iterable of scheduled kernels to test.
    :arg op_count: currently not used by this function.
    :arg op_label: currently not used by this function.
    :arg parameters: mapping from parameter names to values, used for scalar
        arguments, array shapes and the launch size functions.
    :arg print_seq_code: whether to print the sequential code.
    :arg print_code: whether to print the code of each tested kernel.

    :raises RuntimeError: if no OpenCL device is available, or if no
        schedule is obtained for *seq_knl*.
    :raises AssertionError: if a tested kernel's output differs from the
        reference output.
    """
    # {{{ set up CL context for sequential run

    dev = _find_seq_device()
    print("using %s" % (dev,))

    # }}}

    # {{{ compile and run sequential code

    seq_ctx = cl.Context([dev])
    seq_queue = cl.CommandQueue(seq_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    import loopy as lp
    seq_kernel_gen = lp.generate_loop_schedules(seq_knl)
    # Only the first schedule that passes the checks is needed.
    seq_sched_kernel = next(iter(lp.check_kernels(seq_kernel_gen, {})), None)
    if seq_sched_kernel is None:
        raise RuntimeError("no valid schedule found for sequential kernel")

    seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel)
    if print_seq_code:
        print(_RULE)
        print("Sequential Code:")
        print(_RULE)
        print_highlighted_code(seq_compiled.code)
        print(_RULE)

    seq_args, seq_input_arrays, seq_output_arrays = \
            make_seq_args(seq_sched_kernel, seq_queue, parameters)

    seq_compiled.cl_kernel(seq_queue,
            seq_compiled.global_size_func(**parameters),
            seq_compiled.local_size_func(**parameters),
            *seq_args,
            g_times_l=True)

    # Fetch the reference results once; they are the same for every kernel
    # under test.
    seq_results = [ary.get() for ary in seq_output_arrays]

    # }}}

    # {{{ compile and run parallel code

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    for i, kernel in enumerate(kernel_gen):
        # Fresh arguments per kernel: reusing output buffers would let a
        # kernel that writes nothing pass on the previous kernel's results.
        args, output_arrays = make_args(
                queue, kernel, seq_input_arrays, parameters)

        compiled = CompiledKernel(ctx, kernel)
        if print_code:
            print(_RULE)
            print("Kernel #%d:" % i)
            print(_RULE)
            print_highlighted_code(compiled.code)
            print(_RULE)

        compiled.cl_kernel(queue,
                compiled.global_size_func(**parameters),
                compiled.local_size_func(**parameters),
                *args,
                g_times_l=True)

        # Raise explicitly rather than using 'assert', which is stripped
        # when Python runs with -O.
        if len(output_arrays) != len(seq_results):
            raise AssertionError("kernel #%d has %d output arrays, "
                    "sequential kernel has %d"
                    % (i, len(output_arrays), len(seq_results)))

        for ary_nr, (seq_result, out_ary) in enumerate(
                zip(seq_results, output_arrays)):
            if not np.allclose(seq_result, out_ary.get()):
                raise AssertionError("kernel #%d: output array #%d does "
                        "not match the sequential result" % (i, ary_nr))

    # }}}

# }}}
"j_inner"]) diff --git a/test/test_sem.py b/test/test_sem.py index 3d3eaa213466f0f75ce112d0ab1eb961c1d0a1c9..f7af36bf437c64347ca89735f6fceaffd8cf5278 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -288,9 +288,9 @@ def test_sem_3d(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[K] -> {[i,j,k,e,m]: 0<=i,j,k,m<%d and 0<=e ur[i,j,k] = sum_float32(m, D[i,m]*u[e,m,j,k])", - "[|i,j,k] us[i,j,k] = sum_float32(m, D[j,m]*u[e,i,m,k])", - "[|i,j,k] ut[i,j,k] = sum_float32(m, D[k,m]*u[e,i,j,m])", + "[|i,j,k,m] ur[i,j,k] = sum_float32(m, D[i,m]*u[e,m,j,k])", + "[|i,j,k,m] us[i,j,k] = sum_float32(m, D[j,m]*u[e,i,m,k])", + "[|i,j,k:ilp,m] ut[i,j,k] = sum_float32(m, D[k,m]*u[e,i,j,m])", "lap[i,j,k,e] = " " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur[m,j,k] + G[1,e,m,j,k]*us[m,j,k] + G[2,e,m,j,k]*ut[m,j,k]))" @@ -317,7 +317,7 @@ def test_sem_3d(ctx_factory): #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k"]) #knl = lp.realize_cse(knl, "build_ur", np.float32, ["j", "k", "mp"]) knl = lp.preprocess_kernel(knl) - #print knl + print knl #1/0 kernel_gen = lp.generate_loop_schedules(knl)