diff --git a/doc/reference.rst b/doc/reference.rst index e657e47ed8586cb48dd21a173496d14b4d3b3fe7..c8844f4b82b5caf577bfa6b36680fd2cabb78b5e 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -126,7 +126,7 @@ Finishing up Automatic Testing ----------------- -.. autofunction:: auto_test_vs_seq +.. autofunction:: auto_test_ref Troubleshooting --------------- diff --git a/loopy/__init__.py b/loopy/__init__.py index 2342fd9930e938e0d798283b33a10eccbe497a7b..912532ae104c3566c8b792cbb84a33db9dc3e4c7 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -28,14 +28,14 @@ from loopy.cse import precompute from loopy.preprocess import preprocess_kernel from loopy.schedule import generate_loop_schedules from loopy.codegen import generate_code -from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq +from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_ref from loopy.check import check_kernels __all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", "LoopKernel", "get_dot_dependency_graph", "preprocess_kernel", "generate_loop_schedules", "generate_code", - "CompiledKernel", "drive_timing_run", "auto_test_vs_seq", "check_kernels", + "CompiledKernel", "drive_timing_run", "auto_test_vs_ref", "check_kernels", "make_kernel", "split_dimension", "join_dimensions", "tag_dimensions", "extract_subst", "apply_subst", diff --git a/loopy/compiled.py b/loopy/compiled.py index 0b6c9604b7ae7b4d693155745831f7008aea01fb..370573af756e55fe83f84b9aa6f271ff71626308 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -134,7 +134,7 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None, # {{{ automatic testing -def make_seq_args(kernel, queue, parameters): +def make_ref_args(kernel, queue, parameters): from loopy.kernel import ScalarArg, ArrayArg, ImageArg from pymbolic import evaluate @@ -185,7 +185,7 @@ def make_seq_args(kernel, queue, parameters): -def make_args(queue, kernel, seq_input_arrays, parameters): +def make_args(queue, kernel, ref_input_arrays, parameters): from loopy.kernel import ScalarArg, ArrayArg, ImageArg from pymbolic import evaluate @@ -209,12 +209,12 @@ def make_args(queue, kernel, seq_input_arrays, parameters): output_arrays.append(ary) result.append(ary.data) else: - seq_arg = seq_input_arrays.pop(0) + ref_arg = ref_input_arrays.pop(0) if isinstance(arg, ImageArg): - result.append(cl.image_from_array(queue.context, seq_arg.get(), 1)) + result.append(cl.image_from_array(queue.context, ref_arg.get(), 1)) else: - ary = cl_array.to_device(queue, seq_arg.get()) + ary = cl_array.to_device(queue, ref_arg.get()) result.append(ary.data) else: @@ -225,12 +225,12 @@ def make_args(queue, kernel, seq_input_arrays, parameters): -def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, - print_seq_code=False, print_code=True, warmup_rounds=2, timing_rounds=100, +def auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count, op_label, parameters, + print_ref_code=False, print_code=True, warmup_rounds=2, timing_rounds=100, edit_code=False, dump_binary=False, with_annotation=False): from time import time - # {{{ set up CL context for sequential run + # {{{ set up CL context for reference run last_dev = None last_cpu_dev = None @@ -243,7 +243,7 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, if last_cpu_dev is None: dev = last_dev from warnings import warn - warn("No CPU device found for sequential test, using %s." % dev) + warn("No CPU device found for reference test, using %s." % dev) else: dev = last_cpu_dev @@ -251,45 +251,45 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, # }}} - # {{{ compile and run sequential code + # {{{ compile and run reference code - seq_ctx = cl.Context([dev]) - seq_queue = cl.CommandQueue(seq_ctx, + ref_ctx = cl.Context([dev]) + ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) import loopy as lp - seq_kernel_gen = lp.generate_loop_schedules(seq_knl) - for knl in lp.check_kernels(seq_kernel_gen, {}): - seq_sched_kernel = knl + ref_kernel_gen = lp.generate_loop_schedules(ref_knl) + for knl in lp.check_kernels(ref_kernel_gen, parameters): + ref_sched_kernel = knl break - seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel, + ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel, with_annotation=with_annotation) - if print_seq_code: + if print_ref_code: print "----------------------------------------------------------" - print "Sequential Code:" + print "Reference Code:" print "----------------------------------------------------------" - print_highlighted_code(seq_compiled.code) + print_highlighted_code(ref_compiled.code) print "----------------------------------------------------------" - seq_args, seq_input_arrays, seq_output_arrays = \ - make_seq_args(seq_sched_kernel, seq_queue, parameters) + ref_args, ref_input_arrays, ref_output_arrays = \ + make_ref_args(ref_sched_kernel, ref_queue, parameters) - seq_queue.finish() - seq_start = time() + ref_queue.finish() + ref_start = time() - seq_evt = seq_compiled.cl_kernel(seq_queue, - seq_compiled.global_size_func(**parameters), - seq_compiled.local_size_func(**parameters), - *seq_args, + ref_evt = ref_compiled.cl_kernel(ref_queue, + ref_compiled.global_size_func(**parameters), + ref_compiled.local_size_func(**parameters), + *ref_args, g_times_l=True) - seq_queue.finish() - seq_stop = time() - seq_elapsed_wall = seq_stop-seq_start + ref_queue.finish() + ref_stop = time() + ref_elapsed_wall = ref_stop-ref_start - seq_evt.wait() - seq_elapsed = 1e-9*(seq_evt.profile.END-seq_evt.profile.SUBMIT) + ref_evt.wait() + ref_elapsed = 1e-9*(ref_evt.profile.END-ref_evt.profile.SUBMIT) # }}} @@ -301,7 +301,7 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, args = None for i, kernel in enumerate(kernel_gen): if args is None: - args, output_arrays = make_args(queue, kernel, seq_input_arrays, parameters) + args, output_arrays = make_args(queue, kernel, ref_input_arrays, parameters) compiled = CompiledKernel(ctx, kernel, edit_code=edit_code, with_annotation=with_annotation) @@ -325,8 +325,8 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, evt = compiled.cl_kernel(queue, gsize, lsize, *args, g_times_l=True) if do_check: - for seq_out_ary, out_ary in zip(seq_output_arrays, output_arrays): - assert np.allclose(seq_out_ary.get(), out_ary.get(), + for ref_out_ary, out_ary in zip(ref_output_arrays, output_arrays): + assert np.allclose(ref_out_ary.get(), out_ary.get(), rtol=1e-3, atol=1e-3) do_check = False @@ -365,11 +365,14 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, print "elapsed: %g s event, %s s other-event %g s wall, rate: %g %s/s" % ( elapsed, elapsed_evt_2, elapsed_wall, op_count/elapsed, op_label) - print "seq: elapsed: %g s event, %g s wall, rate: %g %s/s" % ( - seq_elapsed, seq_elapsed_wall, op_count/seq_elapsed, op_label) + print "ref: elapsed: %g s event, %g s wall, rate: %g %s/s" % ( + ref_elapsed, ref_elapsed_wall, op_count/ref_elapsed, op_label) # }}} +from pytools import MovedFunctionDeprecationWrapper + +auto_test_vs_seq = MovedFunctionDeprecationWrapper(auto_test_vs_ref) # }}} diff --git a/loopy/cse.py b/loopy/cse.py index f0a5c395a5df73a3cabab501fdfa04f63af8af37..fb26eec59f0afa36e3fe88e39807856a3f7fab60 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -121,7 +121,7 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, dim_type.out, 0, dup_sweep_index) # compute bounds for each storage axis - storage_domain = bounds_footprint_map.domain() + storage_domain = bounds_footprint_map.domain().coalesce() if not storage_domain.is_bounded(): raise RuntimeError("In precomputation of substitution '%s': " @@ -197,7 +197,7 @@ def get_access_info(kernel, subst_name, aug_domain = stor2sweep.move_dims( dim_type.out, stor_idx, dim_type.in_, 0, - n_stor).range().coalesce() + n_stor).range() # aug_domain space now: # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'] @@ -335,13 +335,16 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], sweep_inames = set() for invdesc in invocation_descriptors: - for iname in sweep_axes: - if iname in subst.arguments: - arg_idx = subst.arguments.index(iname) + for swaxis in sweep_axes: + if isinstance(swaxis, int): + sweep_inames.update( + get_dependencies(invdesc.args[swaxis])) + elif swaxis in subst.arguments: + arg_idx = subst.arguments.index(swaxis) sweep_inames.update( get_dependencies(invdesc.args[arg_idx])) else: - sweep_inames.add(iname) + sweep_inames.add(swaxis) sweep_inames = list(sweep_inames) del sweep_axes @@ -433,19 +436,15 @@ def precompute(kernel, subst_name, dtype, sweep_axes=[], # {{{ ensure convexity of new_domain - new_domain = new_domain.coalesce() - if len(new_domain.get_basic_sets()) > 1: hull_new_domain = new_domain.simple_hull() if hull_new_domain <= new_domain: new_domain = hull_new_domain - if len(new_domain.get_basic_sets()) > 1: - print("Substitution '%s' yielded a footprint that was not " - "obviously convex. Now computing convex hull. " - "This might take a *long* time." % subst_name) + new_domain = new_domain.coalesce() - hull_new_domain = new_domain.convex_hull() + if len(new_domain.get_basic_sets()) > 1: + hull_new_domain = new_domain.simple_hull() if hull_new_domain <= new_domain: new_domain = hull_new_domain diff --git a/loopy/kernel.py b/loopy/kernel.py index 69044e63784c784a7ec77827bd439620d4af3d93..30af589db7089c6c7f5c0c1c6d1ca33b57447663 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -1182,6 +1182,7 @@ class SetOperationCacheManager: if set.plain_is_equal(bkt_set) and op == bkt_op and args == bkt_args: return result + #print op, set.get_dim_name(dim_type.set, args[0]) result = getattr(set, op)(*args) bucket.append((set, op, args, result)) return result diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bef261ba13740ec9984c401f7056ecb6c6df4140..a9e39c9fbd5581b5349833a40ef07888fe2a4345 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -98,7 +98,8 @@ def duplicate_reduction_inames(kernel): from loopy.isl_helpers import duplicate_axes for old, new in zip(old_insn_inames, new_insn_inames): new_domain = duplicate_axes(new_domain, [old], [new]) - new_iname_to_tag[new] = kernel.iname_to_tag[old] + if old in kernel.iname_to_tag: + new_iname_to_tag[new] = kernel.iname_to_tag[old] return kernel.copy( instructions=new_insns, @@ -519,7 +520,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # }}} if axis is None: - new_tag = UnrollTag() + new_tag = None else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 0efc205147009d3095d96f30998b113238ef30c8..3e3a423589d20cc321b8bb1784ba34efc41174b0 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -71,9 +71,11 @@ def test_laplacian_stiffness(ctx_factory): knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) knl = lp.precompute(knl, "dPsi", np.float32, - sweep_inames=["K_inner"]) + sweep_axes=["K_inner"]) knl = lp.add_prefetch(knl, "jacInv", ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) + print lp.preprocess_kernel(knl) + 1/0 return knl #for variant in [variant_1, variant_2, variant_3]: @@ -82,7 +84,7 @@ def test_laplacian_stiffness(ctx_factory): loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", parameters={"Nc": Nc}, print_seq_code=True, timing_rounds=30) diff --git a/test/test_linalg.py b/test/test_linalg.py index 48040d2aa476ab22cab7dd7d76214ca4d5bd7ad2..ef0e627bd30589ba247c18c47a2046a575ec6407 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -186,7 +186,7 @@ def test_transpose(ctx_factory): kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, {}) - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=dtype.itemsize*n**2*2/1e9, op_label="GByte", parameters={}) @@ -373,7 +373,7 @@ def test_rank_one(ctx_factory): kernel_gen = lp.generate_loop_schedules(variant(knl)) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=np.dtype(dtype).itemsize*n**2/1e9, op_label="GBytes", parameters={"n": n}) @@ -645,7 +645,7 @@ def test_image_matrix_mul_ilp(ctx_factory): kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=2*n**3/1e9, op_label="GFlops", parameters={}) diff --git a/test/test_sem.py b/test/test_sem.py index d42e030ecfb79844efc4f99fa74459c19f9f913e..a83a74c90e8ea0877637e2d8f67e102caa50b6a2 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -102,7 +102,7 @@ def test_laplacian(ctx_factory): kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9, op_label="GFlops", parameters={"K": K}, print_seq_code=True) @@ -175,7 +175,7 @@ def test_laplacian_lmem(ctx_factory): kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9, op_label="GFlops", parameters={"K": K}, print_seq_code=True) @@ -184,6 +184,74 @@ def test_laplacian_lmem(ctx_factory): +def test_laplacian_lmem_ilp(ctx_factory): + # This does not lead to practical/runnable code (out of lmem), but it's an + # excellent stress test for the code generator. :) + + dtype = np.float32 + ctx = ctx_factory() + order = "C" + + n = 8 + + from pymbolic import var + K_sym = var("K") + + field_shape = (K_sym, n, n, n) + + # K - run-time symbolic + knl = lp.make_kernel(ctx.devices[0], + "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n, + [ + "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])", + "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])", + "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])", + + "lap[e,i,j,k] = " + " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))" + "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))" + "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))" + ], + [ + lp.ArrayArg("u", dtype, shape=field_shape, order=order), + lp.ArrayArg("lap", dtype, shape=field_shape, order=order), + lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order), + lp.ArrayArg("D", dtype, shape=(n, n), order=order), + lp.ScalarArg("K", np.int32, approximately=1000), + ], + name="semlap", assumptions="K>=1") + + + # Must act on u first, otherwise stencil becomes crooked and + # footprint becomes non-convex. + + knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) + knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp") + + knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"]) + + knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"]) + knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"]) + knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"]) + + knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"]) + knl = lp.add_prefetch(knl, "D", ["m", "j"]) + + #print seq_knl + #1/0 + + knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) + + for knl in kernel_gen: + print lp.generate_code(knl) + + + + + def test_advect(ctx_factory): 1/0 # not ready @@ -270,7 +338,7 @@ def test_advect(ctx_factory): K = 1000 - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", parameters={"K": K}, print_seq_code=True,) @@ -390,7 +458,7 @@ def test_advect_dealias(ctx_factory): K = 1000 - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", parameters={"K": K}, print_seq_code=True,) @@ -452,7 +520,7 @@ def test_interp_diff(ctx_factory): kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", parameters={"K": K}, print_seq_code=True,)