def test_transpose(ctx_factory):
    """Generate, check and time kernels for ``b[i, j] = a[j, i]``.

    A square ``float32`` matrix ``a`` of side ``n`` (as chosen by
    ``get_suitable_size``) is transposed into ``b`` on the device.  When the
    timing driver requests a check, the device result is compared against a
    host-side reference ``a.get().T``.

    :arg ctx_factory: zero-argument callable returning the OpenCL context
        to run on.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    # The launcher hands events back to the timing driver; profiling is
    # enabled on the queue so those events can carry timing information.
    queue = cl.CommandQueue(
        ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
        ctx.devices[0],
        "{[i,j]: 0<=i,j<%d}" % n,
        [
            "b[i, j] = a[j, i]"
        ],
        [
            lp.ArrayArg("a", dtype, shape=(n, n), order=order),
            lp.ArrayArg("b", dtype, shape=(n, n), order=order),
        ],
        name="transpose")

    # Split both loops into blocks of 16.  The inner tags are swapped
    # relative to the outer ones ("g.0"/"l.1" for i, "g.1"/"l.0" for j).
    knl = lp.split_dimension(knl, "i", 16,
                             outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_dimension(knl, "j", 16,
                             outer_tag="g.1", inner_tag="l.0")

    # Only the input is prefetched.  An earlier revision also requested a
    # prefetch of 'b' over ["j_inner", "k_inner"]; the domain declares no
    # 'k' (so no "k_inner" can result from the splits above) and 'b' is
    # only ever written by this kernel, so that request was dropped.
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {}, kill_level_min=5)

    a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order)
    b = cl_array.empty_like(a)
    # Host-side reference result; .copy() materializes the transposed view.
    refsol = a.get().T.copy()

    def launcher(kernel, gsize, lsize, check):
        """Run one generated kernel and optionally verify its output."""
        evt = kernel(queue, gsize(), lsize(), a.data, b.data,
                     g_times_l=True)

        if check:
            check_error(refsol, b.get())

        return evt

    lp.drive_timing_run(kernel_gen, queue, launcher, 0)


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        # Runs the first command-line argument as Python source.  This is
        # a developer convenience for invoking a single test by hand; the
        # argument is executed verbatim, so never feed it untrusted input.
        exec(sys.argv[1])