test.py

import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array  # noqa
import pyopencl.tools  # noqa
import pyopencl.clrandom  # noqa
import loopy as lp  # noqa
import sys

import logging

import pytest
from pyopencl.tools import (  # noqa
        pytest_generate_tests_for_pyopencl
        as pytest_generate_tests)

from fixtures import LoopyFixture


f = LoopyFixture()


def test_matvec(ctx_factory):
    a = f.random_array(10, 10)
    b = f.random_array(10)

    c = f.mult_mat_vec(ctx_factory, a=a, b=b, alpha=1.0)

    assert la.norm(a@b - c, 2)/la.norm(c) < 1e-5


def test_compute_flux_derivatives(ctx_factory):
    logging.basicConfig(level="INFO")

    ndim = 3
    nvars = 5
    nx = 10
    ny = 10
    nz = 10

    states = f.random_array(nvars, nx+6, ny+6, nz+6)
    fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
    metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
    metric_jacobians = f.random_array(nx+6, ny+6, nz+6)

    f.compute_flux_derivatives(ctx_factory,
            nvars=nvars, ndim=ndim, nx=nx, ny=ny, nz=nz,
            states=states, fluxes=fluxes, metrics=metrics,
            metric_jacobians=metric_jacobians)


def f_array(queue, *shape):
    ary = np.random.random_sample(shape).astype(np.float32).copy(order="F")
    return cl.array.to_device(queue, ary)


def get_gpu_transformed_weno():
    prg = f.prg

    cfd = prg["compute_flux_derivatives"]

    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")

    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
            lp.AddressSpace.GLOBAL)
    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
            lp.AddressSpace.GLOBAL)
    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
            lp.AddressSpace.GLOBAL)

    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
        cfd = lp.split_iname(cfd, "i"+suffix, 16,
                outer_tag="g.0", inner_tag="l.0")
        cfd = lp.split_iname(cfd, "j"+suffix, 16,
                outer_tag="g.1", inner_tag="l.1")

    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
        cfd = lp.assignment_to_subst(cfd, var_name)

    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")

    prg = prg.with_kernel(cfd)

    # FIXME: These should work, but don't
    # FIXME: Undo the hand-inlining in WENO.F90
    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")

    if 0:
        print(prg["convert_to_generalized_frozen"])
        1/0

    return prg


def test_compute_flux_derivatives_gpu(ctx_factory):
    logging.basicConfig(level="INFO")

    prg = get_gpu_transformed_weno()

    queue = f.get_queue(ctx_factory)

    ndim = 3
    nvars = 5
    nx = 10
    ny = 10
    nz = 10

    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)

    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
        nz+6), dtype=np.float32, order="F")

    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))

    if 1:
        with open("gen-code.cl", "w") as outf:
            outf.write(lp.generate_code_v2(prg).device_code())

    prg = lp.set_options(prg, no_numpy=True)

    prg(queue, nvars=nvars, ndim=ndim,
            states=states, fluxes=fluxes, metrics=metrics,
            metric_jacobians=metric_jacobians,
            flux_derivatives=flux_derivatives_dev)

    prg(queue, nvars=nvars, ndim=ndim,
            states=states, fluxes=fluxes, metrics=metrics,
            metric_jacobians=metric_jacobians,
            flux_derivatives=flux_derivatives_dev)


def benchmark_compute_flux_derivatives_gpu(ctx_factory):
    logging.basicConfig(level="INFO")

    prg = get_gpu_transformed_weno()

    queue = f.get_queue(ctx_factory)

    ndim = 3
    nvars = 5
    n = 16*16
    nx = n
    ny = n
    nz = n

    print("ARRAY GEN")
    states = f_array(queue, nvars, nx+6, ny+6, nz+6)
    fluxes = f_array(queue, nvars, ndim, nx+6, ny+6, nz+6)
    metrics = f_array(queue, ndim, ndim, nx+6, ny+6, nz+6)
    metric_jacobians = f_array(queue, nx+6, ny+6, nz+6)
    print("END ARRAY GEN")

    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
        nz+6), dtype=np.float32, order="F")

    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))

    if 0:
        with open("gen-code.cl", "w") as outf:
            outf.write(lp.generate_code_v2(prg).device_code())

    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
    prg = lp.set_options(prg, ignore_boostable_into=True)
    prg = lp.set_options(prg, no_numpy=True)
    #prg = lp.set_options(prg, write_wrapper=True)
    #op_map = lp.get_op_map(prg, count_redundant_work=False)
    #print(op_map)

    allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))

    from functools import partial
    run = partial(prg, queue, nvars=nvars, ndim=ndim,
            states=states, fluxes=fluxes, metrics=metrics,
            metric_jacobians=metric_jacobians,
            flux_derivatives=flux_derivatives_dev,
            allocator=allocator)

    # {{{ monkeypatch enqueue_nd_range_kernel to trace

    if 0:
        old_enqueue_nd_range_kernel = cl.enqueue_nd_range_kernel

        def enqueue_nd_range_kernel_wrapper(queue, ker, *args, **kwargs):
            print(f"Enqueueing {ker.function_name}")
            return old_enqueue_nd_range_kernel(queue, ker, *args, **kwargs)

        cl.enqueue_nd_range_kernel = enqueue_nd_range_kernel_wrapper

    # }}}

    print("warmup")
    for iwarmup_round in range(2):
        run()

    nrounds = 10

    queue.finish()
    print("timing")
    from time import time
    start = time()

    for iround in range(nrounds):
        run()

    queue.finish()
    one_round = (time() - start)/nrounds

    print(f"M RHSs/s: {ndim*nvars*n**3/one_round/1e6}")
    print(f"elapsed per round: {one_round} s")
    print(f"Output size: {flux_derivatives_dev.nbytes/1e6} MB")


# This lets you run 'python test.py test_case(cl._csc)' without pytest.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
        from pytest import main
        main([__file__])