diff --git a/test.py b/test.py index 0a73b472fe00649a513dfe68b3078548658b50f4..93890bcff9472def502c033cf228c3d563e14b84 100644 --- a/test.py +++ b/test.py @@ -2,6 +2,7 @@ import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array # noqa +import pyopencl.tools # noqa import pyopencl.clrandom # noqa import loopy as lp # noqa import sys @@ -48,6 +49,11 @@ def test_compute_flux_derivatives(ctx_factory): metric_jacobians=metric_jacobians) +def f_array(queue, *shape): + ary = np.random.random_sample(shape).astype(np.float32).copy(order="F") + return cl.array.to_device(queue, ary) + + def get_gpu_transformed_weno(): prg = f.prg @@ -62,7 +68,7 @@ def get_gpu_transformed_weno(): cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", lp.AddressSpace.GLOBAL) - for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6"]: + for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: cfd = lp.split_iname(cfd, "i"+suffix, 16, outer_tag="g.0", inner_tag="l.0") cfd = lp.split_iname(cfd, "j"+suffix, 16, @@ -106,10 +112,10 @@ def test_compute_flux_derivatives_gpu(ctx_factory): ny = 10 nz = 10 - states = f.random_array(nvars, nx+6, ny+6, nz+6) - fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6) - metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f.random_array(nx+6, ny+6, nz+6) + states = f_array(queue, nvars, nx+6, ny+6, nz+6) + fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) + metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) + metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, nz+6), dtype=np.float32, order="F") @@ -120,6 +126,13 @@ def test_compute_flux_derivatives_gpu(ctx_factory): with open("gen-code.cl", "w") as outf: outf.write(lp.generate_code_v2(prg).device_code()) + prg = lp.set_options(prg, no_numpy=True) + + prg(queue, nvars=nvars, ndim=ndim, + states=states, fluxes=fluxes, metrics=metrics, + metric_jacobians=metric_jacobians, + flux_derivatives=flux_derivatives_dev) + prg(queue, nvars=nvars, ndim=ndim, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians, @@ -135,15 +148,17 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory): ndim = 3 nvars = 5 - n = 100 + n = 16*16 nx = n ny = n nz = n - states = f.random_array(nvars, nx+6, ny+6, nz+6) - fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6) - metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f.random_array(nx+6, ny+6, nz+6) + print("ARRAY GEN") + states = f_array(queue, nvars, nx+6, ny+6, nz+6) + fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) + metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) + metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) + print("END ARRAY GEN") flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, nz+6), dtype=np.float32, order="F") @@ -154,15 +169,34 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory): with open("gen-code.cl", "w") as outf: outf.write(lp.generate_code_v2(prg).device_code()) - prg = lp.set_options(prg, ignore_boostable_into=True, write_wrapper=True) + prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) + prg = lp.set_options(prg, ignore_boostable_into=True) + prg = lp.set_options(prg, no_numpy=True) + #prg = lp.set_options(prg, write_wrapper=True) #op_map = lp.get_op_map(prg, count_redundant_work=False) #print(op_map) + allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) + from functools import partial run = partial(prg, queue, nvars=nvars, ndim=ndim, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev) + flux_derivatives=flux_derivatives_dev, + allocator=allocator) + + # {{{ monkeypatch enqueue_nd_range_kernel to trace + + if 0: + old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel + + def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): + print(f"Enqueueing {ker.function_name}") + return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) + + cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper + + # }}} print("warmup") for iwarmup_round in range(2): @@ -181,7 +215,9 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory): queue.finish() one_round = (time() - start)/nrounds - print(f"DOFs/s: {n**3/one_round}, elapsed per round: {one_round} s") + print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") + print(f"elapsed per round: {one_round} s") + print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") # This lets you run 'python test.py test_case(cl._csc)' without pytest.