diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..f87a2197a5a307475eab124c61bb6489d1d689d4 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,106 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array # noqa +import pyopencl.tools # noqa +import pyopencl.clrandom # noqa +import loopy as lp # noqa +import sys + +import logging + +import pytest +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +import fixtures + +def benchmark_compute_flux_derivatives_gpu(ctx_factory): + logging.basicConfig(level="INFO") + + prg = fixtures.get_gpu_transformed_weno() + + queue = fixtures.get_queue(ctx_factory) + + ndim = 3 + nvars = 5 + n = 16*16 + nx = n + ny = n + nz = n + + print("ARRAY GEN") + states = fixtures.f_array(queue, nvars, nx+6, ny+6, nz+6) + fluxes = fixtures.f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) + metrics = fixtures.f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) + metric_jacobians = fixtures.f_array(queue, nx+6, ny+6, nz+6) + print("END ARRAY GEN") + + flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, + nz+6), dtype=np.float32, order="F") + + prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) + + if 0: + with open("gen-code.cl", "w") as outf: + outf.write(lp.generate_code_v2(prg).device_code()) + + prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) + prg = lp.set_options(prg, ignore_boostable_into=True) + prg = lp.set_options(prg, no_numpy=True) + #prg = lp.set_options(prg, write_wrapper=True) + #op_map = lp.get_op_map(prg, count_redundant_work=False) + #print(op_map) + + allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) + + from functools import partial + run = partial(prg, queue, nvars=nvars, ndim=ndim, + states=states, fluxes=fluxes, metrics=metrics, + metric_jacobians=metric_jacobians, + flux_derivatives=flux_derivatives_dev, + allocator=allocator) + + # {{{ monkeypatch enqueue_nd_range_kernel to trace + + if 0: + old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel + + def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): + print(f"Enqueueing {ker.function_name}") + return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) + + cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper + + # }}} + + print("warmup") + for iwarmup_round in range(2): + run() + + nrounds = 10 + + queue.finish() + print("timing") + from time import time + start = time() + + for iround in range(nrounds): + run() + + queue.finish() + one_round = (time() - start)/nrounds + + print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") + print(f"elapsed per round: {one_round} s") + print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + benchmark_compute_flux_derivatives_gpu(cl._csc): + #from pytest import main + #main([__file__]) diff --git a/fixtures.py b/fixtures.py index 73df2fefb9fddbc371eac6297cbdf25cf4245fb2..3ec8efc07a3f773f769e019c2854b01b0a68236f 100644 --- a/fixtures.py +++ b/fixtures.py @@ -11,6 +11,51 @@ from pytest import approx _WENO_PRG = [] _QUEUE = [] +def get_gpu_transformed_weno(): + prg = fixtures.get_weno_program() + + cfd = prg["compute_flux_derivatives"] + + cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") + + cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", + lp.AddressSpace.GLOBAL) + cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", + lp.AddressSpace.GLOBAL) + cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", + lp.AddressSpace.GLOBAL) + + for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: + cfd = lp.split_iname(cfd, "i"+suffix, 16, + outer_tag="g.0", inner_tag="l.0") + cfd = lp.split_iname(cfd, "j"+suffix, 16, + outer_tag="g.1", inner_tag="l.1") + + for var_name in ["delta_xi", "delta_eta", "delta_zeta"]: + cfd = lp.assignment_to_subst(cfd, var_name) + + cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute") + cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff") + cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute") + cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff") + cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute") + cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff") + cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized") + + prg = prg.with_kernel(cfd) + + # FIXME: These should work, but don't + # FIXME: Undo the hand-inlining in WENO.F90 + #prg = lp.inline_callable_kernel(prg, "convert_to_generalized") + #prg = lp.inline_callable_kernel(prg, "convert_from_generalized") + + if 0: + print(prg["convert_to_generalized_frozen"]) + 1/0 + + return prg + + def get_queue(ctx_factory): if not _QUEUE: ctx = ctx_factory() @@ -43,6 +88,10 @@ def get_weno_program(): _WENO_PRG.append(prg) return prg +def f_array(queue, *shape): + ary = np.random.random_sample(shape).astype(np.float32).copy(order="F") + return cl.array.to_device(queue, ary) + def random_array(*args): return np.random.random_sample(args).astype(np.float32).copy(order="F") diff --git a/test.py b/test.py index 1f4afa31dc186e82e19afa7163a829461a40d02a..5db0b73509d167009a80075d8cc12f546efc7d38 100644 --- a/test.py +++ b/test.py @@ -46,60 +46,10 @@ def test_compute_flux_derivatives(ctx_factory): metric_jacobians=metric_jacobians) -def f_array(queue, *shape): - ary = np.random.random_sample(shape).astype(np.float32).copy(order="F") - return cl.array.to_device(queue, ary) - - -def get_gpu_transformed_weno(): - prg = fixtures.get_weno_program() - - cfd = prg["compute_flux_derivatives"] - - cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") - - cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", - lp.AddressSpace.GLOBAL) - - for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: - cfd = lp.split_iname(cfd, "i"+suffix, 16, - outer_tag="g.0", inner_tag="l.0") - cfd = lp.split_iname(cfd, "j"+suffix, 16, - outer_tag="g.1", inner_tag="l.1") - - for var_name in ["delta_xi", "delta_eta", "delta_zeta"]: - cfd = lp.assignment_to_subst(cfd, var_name) - - cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute") - cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff") - cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute") - cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff") - cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute") - cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff") - cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized") - - prg = prg.with_kernel(cfd) - - # FIXME: These should work, but don't - # FIXME: Undo the hand-inlining in WENO.F90 - #prg = lp.inline_callable_kernel(prg, "convert_to_generalized") - #prg = lp.inline_callable_kernel(prg, "convert_from_generalized") - - if 0: - print(prg["convert_to_generalized_frozen"]) - 1/0 - - return prg - - def test_compute_flux_derivatives_gpu(ctx_factory): logging.basicConfig(level="INFO") - prg = get_gpu_transformed_weno() + prg = fixtures.get_gpu_transformed_weno() queue = fixtures.get_queue(ctx_factory) @@ -109,10 +59,10 @@ def test_compute_flux_derivatives_gpu(ctx_factory): ny = 10 nz = 10 - states = f_array(queue, nvars, nx+6, ny+6, nz+6) - fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) - metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) + states = fixtures.f_array(queue, nvars, nx+6, ny+6, nz+6) + fluxes = fixtures.f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) + metrics = fixtures.f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) + metric_jacobians = fixtures.f_array(queue, nx+6, ny+6, nz+6) flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, nz+6), dtype=np.float32, order="F") @@ -136,87 +86,6 @@ def test_compute_flux_derivatives_gpu(ctx_factory): flux_derivatives=flux_derivatives_dev) -def benchmark_compute_flux_derivatives_gpu(ctx_factory): - logging.basicConfig(level="INFO") - - prg = get_gpu_transformed_weno() - - queue = fixtures.get_queue(ctx_factory) - - ndim = 3 - nvars = 5 - n = 16*16 - nx = n - ny = n - nz = n - - print("ARRAY GEN") - states = f_array(queue, nvars, nx+6, ny+6, nz+6) - fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) - metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) - print("END ARRAY GEN") - - flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, - nz+6), dtype=np.float32, order="F") - - prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) - - if 0: - with open("gen-code.cl", "w") as outf: - outf.write(lp.generate_code_v2(prg).device_code()) - - prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) - prg = lp.set_options(prg, ignore_boostable_into=True) - prg = lp.set_options(prg, no_numpy=True) - #prg = lp.set_options(prg, write_wrapper=True) - #op_map = lp.get_op_map(prg, count_redundant_work=False) - #print(op_map) - - allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) - - from functools import partial - run = partial(prg, queue, nvars=nvars, ndim=ndim, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev, - allocator=allocator) - - # {{{ monkeypatch enqueue_nd_range_kernel to trace - - if 0: - old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel - - def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): - print(f"Enqueueing {ker.function_name}") - return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) - - cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper - - # }}} - - print("warmup") - for iwarmup_round in range(2): - run() - - nrounds = 10 - - queue.finish() - print("timing") - from time import time - start = time() - - for iround in range(nrounds): - run() - - queue.finish() - one_round = (time() - start)/nrounds - - print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") - print(f"elapsed per round: {one_round} s") - print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") - - # This lets you run 'python test.py test_case(cl._csc)' without pytest. if __name__ == "__main__": if len(sys.argv) > 1: