diff --git a/WENO.F90 b/WENO.F90 index e8618ef65764d2f64d310a023855a0651cfe8652..829080e5f56556ffcc276e33a5ad8d0aa0628ab6 100644 --- a/WENO.F90 +++ b/WENO.F90 @@ -303,9 +303,9 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam if (im > 3) im = im - 3 do j=1,2 - u_orig(1,j) = states(ik+1,j)/states(1,j) - u_orig(2,j) = states(il+1,j)/states(1,j) - u_orig(3,j) = states(im+1,j)/states(1,j) + do i=1,ndim + u_orig(i,j) = states(i+1,j)/states(1,j) + end do end do do j=1,2 @@ -350,7 +350,7 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam c = sqrt((1.4 - 1.0)*(H - 0.5*q)) b1 = (1.4 - 1.0)/(c**2) - b2 = 1.0 + b1*q**2 - b1*H + b2 = 1.0 + b1*q - b1*H u_tilde(1) = 0.0 do i=1,ndim @@ -370,11 +370,11 @@ subroutine roe_eigensystem(nvars, ndim, d, states, metrics_frozen, R, R_inv, lam alpha = rho/(2.0*c) beta = 1.0/(2.0*alpha) - lambda_roe(1) = u(1) - lambda_roe(2) = u(1) - lambda_roe(3) = u(1) - lambda_roe(4) = u(1) + c - lambda_roe(5) = u(1) - c + lambda_roe(1) = u_tilde(1)*metric_norm(ik) + lambda_roe(2) = u_tilde(1)*metric_norm(ik) + lambda_roe(3) = u_tilde(1)*metric_norm(ik) + lambda_roe(4) = u_tilde(1)*metric_norm(ik) + c*metric_norm(ik) + lambda_roe(5) = u_tilde(1)*metric_norm(ik) - c*metric_norm(ik) R(1,1) = 1.0 R(2,1) = u(1) @@ -951,6 +951,20 @@ end subroutine ! ! prg = lp.parse_fortran(lp.c_preprocess(SOURCE), FILENAME) ! prg = lp.fix_parameters(prg, ndim=3, nvars=5, _remove=False) +! +! cfd = prg["compute_flux_derivatives"] +! +! cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") +! +! cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", +! lp.AddressSpace.GLOBAL) +! cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", +! lp.AddressSpace.GLOBAL) +! cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", +! lp.AddressSpace.GLOBAL) +! +! prg = prg.with_kernel(cfd) +! ! RESULT = prg ! !$loopy end diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..71859c74a50af454451a47b6bfa2a176114c66fb --- /dev/null +++ b/benchmark.py @@ -0,0 +1,96 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array # noqa +import pyopencl.tools # noqa +import pyopencl.clrandom # noqa +import loopy as lp # noqa +import sys + +import logging + +from utilities import * + + +def benchmark_compute_flux_derivatives_gpu(ctx_factory, write_code=False): + logging.basicConfig(level="INFO") + + prg = get_weno_program() + prg = transform_weno_for_gpu(prg) + + queue = get_queue(ctx_factory) + prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) + prg = lp.set_options(prg, no_numpy=True) + prg = lp.set_options(prg, ignore_boostable_into=True) + #prg = lp.set_options(prg, write_wrapper=True) + #op_map = lp.get_op_map(prg, count_redundant_work=False) + #print(op_map) + + ndim = 3 + nvars = 5 + n = 16*16 + nx = n + ny = n + nz = n + + print("ARRAY GEN") + states = random_array_on_device(queue, nvars, nx+6, ny+6, nz+6) + fluxes = random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6) + metrics = random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6) + metric_jacobians = random_array_on_device(queue, nx+6, ny+6, nz+6) + print("END ARRAY GEN") + + flux_derivatives_dev = empty_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6) + + if write_code: + write_target_device_code(prg) + + allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) + + from functools import partial + run = partial(prg, queue, nvars=nvars, ndim=ndim, + states=states, fluxes=fluxes, metrics=metrics, + metric_jacobians=metric_jacobians, + flux_derivatives=flux_derivatives_dev, + allocator=allocator) + + # {{{ monkeypatch enqueue_nd_range_kernel to trace + + if 0: + old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel + + def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): + print(f"Enqueueing {ker.function_name}") + return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) + + cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper + + # }}} + + print("warmup") + for iwarmup_round in range(2): + run() + + nrounds = 10 + + queue.finish() + print("timing") + from time import time + start = time() + + for iround in range(nrounds): + run() + + queue.finish() + one_round = (time() - start)/nrounds + + print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") + print(f"elapsed per round: {one_round} s") + print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + benchmark_compute_flux_derivatives_gpu(cl._csc) diff --git a/build-and-test-py-project.sh b/build-and-test-py-project.sh index 0810a3871a048bb83a6f317e6eba1fbc8dc00fa3..8bfbc06940aa87368d22708756076e22b9e615a8 100644 --- a/build-and-test-py-project.sh +++ b/build-and-test-py-project.sh @@ -55,9 +55,7 @@ export XDG_CACHE_HOME=$HOME/.cache/$CI_RUNNER_ID $PIP install --upgrade pip $PIP install setuptools -# Pinned to 3.0.4 because of https://github.com/pytest-dev/pytest/issues/2434 -# Install before a newer version gets pulled in as a dependency -$PIP install pytest==3.0.4 pytest-warnings==0.2.0 +$PIP install pytest if test "$EXTRA_INSTALL" != ""; then for i in $EXTRA_INSTALL ; do @@ -87,6 +85,6 @@ if test -f $REQUIREMENTS_TXT; then $PIP install -r $REQUIREMENTS_TXT fi -${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxsw test.py +${PY_EXE} -m pytest -rw --durations=10 --tb=native --junitxml=pytest.xml -rxsw --runslow test.py # vim: foldmethod=marker diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..29be9d5330e9401e60381cb48a17fd6de4ed5559 --- /dev/null +++ b/conftest.py @@ -0,0 +1,21 @@ +# setup to mark slow tests with @pytest.mark.slow, so that they don't run by +# default, but can be forced to run with the command-line option --runslow +# taken from +# https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option + +import pytest + + +def pytest_addoption(parser): + parser.addoption("--runslow", action="store_true", default=False, + help="run slow tests") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runslow"): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/fixtures.py b/fixtures.py deleted file mode 100644 index 257f58216af3fa307929dec6a4b79d34252ebfc5..0000000000000000000000000000000000000000 --- a/fixtures.py +++ /dev/null @@ -1,89 +0,0 @@ -import numpy as np -import numpy.linalg as la # noqa -import pyopencl as cl -import pyopencl.array # noqa -import pyopencl.clrandom # noqa -import loopy as lp - -from pytest import approx - - -def with_root_kernel(prg, root_name): - # FIXME This is a little less beautiful than it could be - new_prg = prg.copy(name=root_name) - for name in prg: - clbl = new_prg[name] - if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host: - new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False)) - - new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True)) - return new_prg - - -class LoopyFixture: - _WENO_PRG = [] - _QUEUE = [] - - def __init__(self): - self.prg = self.get_weno_program() - - def get_weno_program(self): - if self._WENO_PRG: - return self._WENO_PRG[0] - - fn = "WENO.F90" - - with open(fn, "r") as infile: - infile_content = infile.read() - - prg = lp.parse_transformed_fortran(infile_content, filename=fn) - self._WENO_PRG.append(prg) - return prg - - def get_queue(self, ctx_factory): - if not self._QUEUE: - ctx = ctx_factory() - self._QUEUE.append(cl.CommandQueue(ctx)) - return self._QUEUE[0] - - def random_array(self, *args): - return np.random.random_sample(args).astype(np.float32).copy(order="F") - - def mult_mat_vec(self, ctx_factory, alpha, a, b): - queue = self.get_queue(ctx_factory) - - c_dev = cl.array.empty(queue, 10, dtype=np.float32) - - prg = with_root_kernel(self.prg, "mult_mat_vec") - prg(queue, a=a, b=b, c=c_dev, alpha=alpha) - - return c_dev.get() - - def compute_flux_derivatives(self, ctx_factory, - nvars, ndim, nx, ny, nz, - states, fluxes, metrics, metric_jacobians): - - queue = self.get_queue(ctx_factory) - - prg = self.prg - cfd = prg["compute_flux_derivatives"] - - cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") - - cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", - lp.AddressSpace.GLOBAL) - - prg = prg.with_kernel(cfd) - - flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, - nz+6), dtype=np.float32, order="F") - - prg(queue, nvars=nvars, ndim=ndim, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev) - return flux_derivatives_dev.get() diff --git a/test.py b/test.py index 93890bcff9472def502c033cf228c3d563e14b84..97ccfe7cc1d668e3ccfeae5e4bb4dfd5410686ca 100644 --- a/test.py +++ b/test.py @@ -5,225 +5,123 @@ import pyopencl.array # noqa import pyopencl.tools # noqa import pyopencl.clrandom # noqa import loopy as lp # noqa -import sys +import sys import logging import pytest +from pytest import approx from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) -from fixtures import LoopyFixture - - -f = LoopyFixture() - - -def test_matvec(ctx_factory): - a = f.random_array(10, 10) - b = f.random_array(10) +from utilities import * + + +@pytest.mark.parametrize("states_str,fluxes_str,direction", [ + ("2 4 4 4 20,1 1 1 1 5.5", "4 11.2 8 8 46.4,1 2.6 1 1 7.1", "x"), + ("2 4 4 4 20,1 1 1 1 5.5", "4 8 11.2 8 46.4,1 1 2.6 1 7.1", "y"), + ("2 4 4 4 20,1 1 1 1 5.5", "4 8 8 11.2 46.4,1 1 1 2.6 7.1", "z"), + ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 2.6 1 1 -7.1,-4 11.2 8 8 -46.4", "x"), + ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 1 2.6 1 -7.1,-4 8 11.2 8 -46.4", "y"), + ("1 -1 -1 -1 5.5,2 -4 -4 -4 20", "-1 1 1 2.6 -7.1,-4 8 8 11.2 -46.4", "z"), + ("2 4 8 12 64,1 1 2 3 11", "4 11.2 16 24 134.4,1 2.6 2 3 12.6", "x"), + ("2 4 8 12 64,1 1 2 3 11", "8 16 35.2 48 268.8,2 2 5.6 6 25.2", "y"), + ("2 4 8 12 64,1 1 2 3 11", "12 24 48 75.2 403.2,3 3 6 10.6 37.8", "z"), + ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-1 2.6 2 3 -12.6,-4 11.2 16 24 -134.4", "x"), + ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-2 2 5.6 6 -25.2,-8 16 35.2 48 -268.8", "y"), + ("1 -1 -2 -3 11,2 -4 -8 -12 64", "-3 3 6 10.6 -37.8,-12 24 48 75.2 -403.2", "z") + ]) +def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction): + def identity_matrix(n): + return np.identity(n).astype(np.float32).copy(order="F") + + def check_roe_identity(states, R, Rinv): + dState = states[:,1] - states[:,0] + compare_arrays(R@(Rinv@dState), dState) + + def check_roe_property(states, fluxes, R, Rinv, lam): + dState = states[:,1] - states[:,0] + dFlux = fluxes[:,1] - fluxes[:,0] + + temp = Rinv@dState + temp = np.multiply(lam, temp) + compare_arrays(R@temp, dFlux) + + prg = get_weno_program_with_root_kernel("roe_eigensystem") + queue = get_queue(ctx_factory) - c = f.mult_mat_vec(ctx_factory, a=a, b=b, alpha=1.0) - - assert la.norm(a@b - c, 2)/la.norm(c) < 1e-5 - - -def test_compute_flux_derivatives(ctx_factory): - logging.basicConfig(level="INFO") - - ndim = 3 nvars = 5 - nx = 10 - ny = 10 - nz = 10 - - states = f.random_array(nvars, nx+6, ny+6, nz+6) - fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6) - metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f.random_array(nx+6, ny+6, nz+6) - - f.compute_flux_derivatives(ctx_factory, - nvars=nvars, ndim=ndim, nx=nx, ny=ny, nz=nz, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians) - - -def f_array(queue, *shape): - ary = np.random.random_sample(shape).astype(np.float32).copy(order="F") - return cl.array.to_device(queue, ary) - - -def get_gpu_transformed_weno(): - prg = f.prg - - cfd = prg["compute_flux_derivatives"] - - cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0") - - cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "generalized_fluxes", - lp.AddressSpace.GLOBAL) - cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp", - lp.AddressSpace.GLOBAL) + ndim = 3 + dirs = {"x" : 1, "y" : 2, "z" : 3} - for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: - cfd = lp.split_iname(cfd, "i"+suffix, 16, - outer_tag="g.0", inner_tag="l.0") - cfd = lp.split_iname(cfd, "j"+suffix, 16, - outer_tag="g.1", inner_tag="l.1") + states = transposed_array_from_string(states_str) + fluxes = transposed_array_from_string(fluxes_str) + metrics_frozen = identity_matrix(ndim) - for var_name in ["delta_xi", "delta_eta", "delta_zeta"]: - cfd = lp.assignment_to_subst(cfd, var_name) + R_dev = empty_array_on_device(queue, nvars, nvars) + Rinv_dev = empty_array_on_device(queue, nvars, nvars) + lam_dev = empty_array_on_device(queue, nvars) - cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute") - cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff") - cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute") - cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff") - cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute") - cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff") - cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized") + prg(queue, nvars=nvars, ndim=ndim, d=dirs[direction], + states=states, metrics_frozen=metrics_frozen, + R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev) - prg = prg.with_kernel(cfd) + R = R_dev.get() + Rinv = Rinv_dev.get() + lam = lam_dev.get() - # FIXME: These should work, but don't - # FIXME: Undo the hand-inlining in WENO.F90 - #prg = lp.inline_callable_kernel(prg, "convert_to_generalized") - #prg = lp.inline_callable_kernel(prg, "convert_from_generalized") + print(lam) - if 0: - print(prg["convert_to_generalized_frozen"]) - 1/0 + check_roe_identity(states, R, Rinv) + check_roe_property(states, fluxes, R, Rinv, lam) - return prg +def test_matvec(ctx_factory): + prg = get_weno_program_with_root_kernel("mult_mat_vec") + queue = get_queue(ctx_factory) -def test_compute_flux_derivatives_gpu(ctx_factory): - logging.basicConfig(level="INFO") + a = random_array_on_device(queue, 10, 10) + b = random_array_on_device(queue, 10) - prg = get_gpu_transformed_weno() + c = empty_array_on_device(queue, 10) - queue = f.get_queue(ctx_factory) + prg(queue, alpha=1.0, a=a, b=b, c=c) - ndim = 3 - nvars = 5 - nx = 10 - ny = 10 - nz = 10 + compare_arrays(a.get()@b.get(), c.get()) - states = f_array(queue, nvars, nx+6, ny+6, nz+6) - fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) - metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) - flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, - nz+6), dtype=np.float32, order="F") +@pytest.mark.slow +def test_compute_flux_derivatives(ctx_factory): + prg = get_weno_program() + queue = get_queue(ctx_factory) prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) - if 1: - with open("gen-code.cl", "w") as outf: - outf.write(lp.generate_code_v2(prg).device_code()) - - prg = lp.set_options(prg, no_numpy=True) - - prg(queue, nvars=nvars, ndim=ndim, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev) - - prg(queue, nvars=nvars, ndim=ndim, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev) - - -def benchmark_compute_flux_derivatives_gpu(ctx_factory): - logging.basicConfig(level="INFO") - - prg = get_gpu_transformed_weno() - - queue = f.get_queue(ctx_factory) - - ndim = 3 - nvars = 5 - n = 16*16 - nx = n - ny = n - nz = n - - print("ARRAY GEN") - states = f_array(queue, nvars, nx+6, ny+6, nz+6) - fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6) - metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6) - metric_jacobians = f_array(queue, nx+6, ny+6, nz+6) - print("END ARRAY GEN") + lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1, + parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16)) - flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6, - nz+6), dtype=np.float32, order="F") - prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) - - if 0: - with open("gen-code.cl", "w") as outf: - outf.write(lp.generate_code_v2(prg).device_code()) +@pytest.mark.slow +def test_compute_flux_derivatives_gpu(ctx_factory, write_code=False): + prg = get_weno_program() + prg = transform_weno_for_gpu(prg) + queue = get_queue(ctx_factory) prg = prg.copy(target=lp.PyOpenCLTarget(queue.device)) - prg = lp.set_options(prg, ignore_boostable_into=True) prg = lp.set_options(prg, no_numpy=True) - #prg = lp.set_options(prg, write_wrapper=True) - #op_map = lp.get_op_map(prg, count_redundant_work=False) - #print(op_map) - - allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) - - from functools import partial - run = partial(prg, queue, nvars=nvars, ndim=ndim, - states=states, fluxes=fluxes, metrics=metrics, - metric_jacobians=metric_jacobians, - flux_derivatives=flux_derivatives_dev, - allocator=allocator) - - # {{{ monkeypatch enqueue_nd_range_kernel to trace - - if 0: - old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel - - def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs): - print(f"Enqueueing {ker.function_name}") - return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs) - - cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper - - # }}} - - print("warmup") - for iwarmup_round in range(2): - run() - - nrounds = 10 - - queue.finish() - print("timing") - from time import time - start = time() - - for iround in range(nrounds): - run() - queue.finish() - one_round = (time() - start)/nrounds + if write_code: + write_target_device_code(prg) - print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}") - print(f"elapsed per round: {one_round} s") - print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB") + lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1, + parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16)) # This lets you run 'python test.py test_case(cl._csc)' without pytest. if __name__ == "__main__": if len(sys.argv) > 1: + logging.basicConfig(level="INFO") exec(sys.argv[1]) else: - from pytest import main - main([__file__]) + pytest.main([__file__]) diff --git a/utilities.py b/utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..b825fcbe82ee7ef0d2a991edf9ed70f136b55f80 --- /dev/null +++ b/utilities.py @@ -0,0 +1,162 @@ +import numpy as np +import numpy.linalg as la +import pyopencl as cl +import pyopencl.array # noqa +import pyopencl.tools # noqa +import pyopencl.clrandom # noqa +import loopy as lp # noqa +from pytest import approx + + +# {{{ arrays + +def compare_arrays(a, b): + assert a == approx(b) + + +def random_array_on_device(queue, *shape): + ary = empty_array_on_device(queue, *shape) + cl.clrandom.fill_rand(ary) + return ary + + +def empty_array_on_device(queue, *shape): + return cl.array.empty(queue, shape, dtype=np.float32, order="F") + + +def arrays_from_string(string_arrays): + return split_map_to_list(string_arrays, array_from_string, ":") + + +def transposed_array_from_string(string_array): + return array_from_string(string_array).transpose().copy(order="F") + + +def array_from_string(string_array): + def array_from_string_1d(string_array): + if string_array[0] == "i": + return np.array(split_map_to_list(string_array[1:], int, " ")) + else: + return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32) + + def array_from_string_2d(string_array): + if string_array[0] == ",": + return array_from_string_1d(string_array[1:]).reshape((-1, 1)) + else: + return np.array(split_map_to_list(string_array, array_from_string_1d, ",")) + + def array_from_string_3d(string_array): + if string_array[0] == ";": + return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1)) + else: + return np.array(split_map_to_list(string_array, array_from_string_2d, ";")) + + if ";" not in string_array: + if "," not in string_array: + array = array_from_string_1d(string_array) + else: + array = array_from_string_2d(string_array) + else: + array = array_from_string_3d(string_array) + return array.copy(order="F") + + +def split_map_to_list(string, map_func, splitter): + return list(map(map_func, string.split(splitter))) + +# }}} + +# {{{ device + +def get_queue(ctx_factory): + ctx = ctx_factory() + return cl.CommandQueue(ctx) + +# }}} + +# {{{ program / kernel + +_WENO_PRG = {} + + +def parse_weno(): + fn = "WENO.F90" + + with open(fn, "r") as infile: + infile_content = infile.read() + + prg = lp.parse_transformed_fortran(infile_content, filename=fn) + _WENO_PRG["default"] = prg + + +def get_weno_program(): + if not _WENO_PRG: + parse_weno() + return _WENO_PRG["default"] + + +def with_root_kernel(prg, root_name): + # FIXME This is a little less beautiful than it could be + new_prg = prg.copy(name=root_name) + for name in prg: + clbl = new_prg[name] + if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host: + new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False)) + + new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True)) + return new_prg + + +def setup_weno_program_with_root_kernel(knl): + prg = get_weno_program() + prg = with_root_kernel(prg, knl) + _WENO_PRG[knl] = prg + + +def get_weno_program_with_root_kernel(knl): + if not knl in _WENO_PRG: + setup_weno_program_with_root_kernel(knl) + return _WENO_PRG[knl] + + +def transform_weno_for_gpu(prg, print_kernel=False): + cfd = prg["compute_flux_derivatives"] + + for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]: + cfd = lp.split_iname(cfd, "i"+suffix, 16, + outer_tag="g.0", inner_tag="l.0") + cfd = lp.split_iname(cfd, "j"+suffix, 16, + outer_tag="g.1", inner_tag="l.1") + + for var_name in ["delta_xi", "delta_eta", "delta_zeta"]: + cfd = lp.assignment_to_subst(cfd, var_name) + + cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute") + cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff") + cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute") + cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff") + cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute") + cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff") + cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized") + + prg = prg.with_kernel(cfd) + + # FIXME: These should work, but don't + # FIXME: Undo the hand-inlining in WENO.F90 + #prg = lp.inline_callable_kernel(prg, "convert_to_generalized") + #prg = lp.inline_callable_kernel(prg, "convert_from_generalized") + + if print_kernel: + print(prg["convert_to_generalized_frozen"]) + 1/0 + + return prg + + +def write_target_device_code(prg, outfilename="gen-code.cl"): + with open(outfilename, "w") as outf: + outf.write(lp.generate_code_v2(prg).device_code()) + +# }}} + +# vim: foldmethod=marker