import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array # noqa import pyopencl.tools # noqa import pyopencl.clrandom # noqa import loopy as lp # noqa import sys import logging import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import fixtures def test_matvec(ctx_factory): a = fixtures.random_array(10, 10) b = fixtures.random_array(10) c = fixtures.mult_mat_vec(ctx_factory, a=a, b=b, alpha=1.0) assert la.norm(a@b - c, 2)/la.norm(c) < 1e-5 def test_compute_flux_derivatives(ctx_factory): logging.basicConfig(level="INFO") ndim = 3 nvars = 5 nx = 10 ny = 10 nz = 10 states = fixtures.random_array(nvars, nx+6, ny+6, nz+6) fluxes = fixtures.random_array(nvars, ndim, nx+6, ny+6, nz+6) metrics = fixtures.random_array(ndim, ndim, nx+6, ny+6, nz+6) metric_jacobians = fixtures.random_array(nx+6, ny+6, nz+6) fixtures.compute_flux_derivatives(ctx_factory, nvars=nvars, ndim=ndim, nx=nx, ny=ny, nz=nz, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians) def f_array(queue, *shape): ary = np.random.random_sample(shape).astype(np.float32).copy(order="F") return cl.array.to_device(queue, ary) def get_gpu_transformed_weno(): prg = fixtures.get_weno_program() cfd = prg["compute_flux_derivatives"] cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", lp.AddressSpace.GLOBAL) cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", lp.AddressSpace.GLOBAL) cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", lp.AddressSpace.GLOBAL) for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: cfd = lp.split_iname(cfd, "i"+suffix, 16, outer_tag="g.0", inner_tag="l.0") cfd = lp.split_iname(cfd, "j"+suffix, 16, outer_tag="g.1", inner_tag="l.1") for var_name in ["delta_xi", "delta_eta", "delta_zeta"]: cfd = lp.assignment_to_subst(cfd, var_name) cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute") cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff") cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute") cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff") cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute") cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff") cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized") prg = prg.with_kernel(cfd) # FIXME: These should work, but don't # FIXME: Undo the hand-inlining in WENO.F90 #prg = lp.inline_callable_kernel(prg, "convert_to_generalized") #prg = lp.inline_callable_kernel(prg, "convert_from_generalized") if 0: print(prg["convert_to_generalized_frozen"]) 1/0 return prg def test_compute_flux_derivatives_gpu(ctx_factory): logging.basicConfig(level="INFO") prg = get_gpu_transformed_weno() queue = fixtures.get_queue(ctx_factory) ndim = 3 nvars = 5 nx = 10 ny = 10 nz = 10 states = f_array(queue, nvars, nx+6, ny+6, nz+6) fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, nz+6), dtype=np.float32, order="F") prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) if 1: with open("gen-code.cl", "w") as outf: outf.write(lp.generate_code_v2(prg).device_code()) prg = lp.set_options(prg, no_numpy=True) prg(queue, nvars=nvars, ndim=ndim, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians, flux_derivatives=flux_derivatives_dev) prg(queue, nvars=nvars, ndim=ndim, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians, flux_derivatives=flux_derivatives_dev) def benchmark_compute_flux_derivatives_gpu(ctx_factory): logging.basicConfig(level="INFO") prg = get_gpu_transformed_weno() queue = fixtures.get_queue(ctx_factory) ndim = 3 nvars = 5 n = 16*16 nx = n ny = n nz = n print("ARRAY GEN") states = f_array(queue, nvars, nx+6, ny+6, nz+6) fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) print("END ARRAY GEN") flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, nz+6), dtype=np.float32, order="F") prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) if 0: with open("gen-code.cl", "w") as outf: outf.write(lp.generate_code_v2(prg).device_code()) prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) prg = lp.set_options(prg, ignore_boostable_into=True) prg = lp.set_options(prg, no_numpy=True) #prg = lp.set_options(prg, write_wrapper=True) #op_map = lp.get_op_map(prg, count_redundant_work=False) #print(op_map) allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) from functools import partial run = partial(prg, queue, nvars=nvars, ndim=ndim, states=states, fluxes=fluxes, metrics=metrics, metric_jacobians=metric_jacobians, flux_derivatives=flux_derivatives_dev, allocator=allocator) # {{{ monkeypatch enqueue_nd_range_kernel to trace if 0: old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): print(f"Enqueueing {ker.function_name}") return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper # }}} print("warmup") for iwarmup_round in range(2): run() nrounds = 10 queue.finish() print("timing") from time import time start = time() for iround in range(nrounds): run() queue.finish() one_round = (time() - start)/nrounds print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") print(f"elapsed per round: {one_round} s") print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") # This lets you run 'python test.py test_case(cl._csc)' without pytest. if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: from pytest import main main([__file__])