Newer
Older
import pyopencl.array # noqa
import pyopencl.clrandom # noqa
Timothy A. Smith
committed
import pytest
from pyopencl.tools import ( # noqa
pytest_generate_tests_for_pyopencl
as pytest_generate_tests)
a = f.random_array(10, 10)
b = f.random_array(10)
c = f.mult_mat_vec(ctx_factory, a=a, b=b, alpha=1.0)
assert la.norm(a@b - c, 2)/la.norm(c) < 1e-5
@pytest.mark.skip("slow")
Kaushik Kulkarni
committed
def test_compute_flux_derivatives(ctx_factory):
Kaushik Kulkarni
committed
ndim = 3
nvars = 5
nx = 10
ny = 10
nz = 10
states = f.random_array(nvars, nx+6, ny+6, nz+6)
fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
Kaushik Kulkarni
committed
f.compute_flux_derivatives(ctx_factory,
nvars=nvars, ndim=ndim, nx=nx, ny=ny, nz=nz,
states=states, fluxes=fluxes, metrics=metrics,
metric_jacobians=metric_jacobians)
Kaushik Kulkarni
committed
prg = f.prg
cfd = prg["compute_flux_derivatives"]
cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
lp.AddressSpace.GLOBAL)
cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
lp.AddressSpace.GLOBAL)
cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
lp.AddressSpace.GLOBAL)
for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6"]:
cfd = lp.split_iname(cfd, "i"+suffix, 16,
outer_tag="g.0", inner_tag="l.0")
cfd = lp.split_iname(cfd, "j"+suffix, 16,
outer_tag="g.1", inner_tag="l.1")
for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
cfd = lp.assignment_to_subst(cfd, var_name)
cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
prg = prg.with_kernel(cfd)
# FIXME: These should work, but don't
# FIXME: Undo the hand-inlining in WENO.F90
#prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
#prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
if 0:
print(prg["convert_to_generalized_frozen"])
1/0
Timothy A. Smith
committed
@pytest.mark.skip("fails")
def test_compute_flux_derivatives_gpu(ctx_factory):
logging.basicConfig(level="INFO")
prg = get_gpu_transformed_weno()
queue = f.get_queue(ctx_factory)
ndim = 3
nvars = 5
nx = 10
ny = 10
nz = 10
states = f.random_array(nvars, nx+6, ny+6, nz+6)
fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
nz+6), dtype=np.float32, order="F")
with open("gen-code.cl", "w") as outf:
outf.write(lp.generate_code_v2(prg).device_code())
prg(queue, nvars=nvars, ndim=ndim,
states=states, fluxes=fluxes, metrics=metrics,
metric_jacobians=metric_jacobians,
flux_derivatives=flux_derivatives_dev)
def benchmark_compute_flux_derivatives_gpu(ctx_factory):
logging.basicConfig(level="INFO")
prg = get_gpu_transformed_weno()
queue = f.get_queue(ctx_factory)
ndim = 3
nvars = 5
n = 100
nx = n
ny = n
nz = n
states = f.random_array(nvars, nx+6, ny+6, nz+6)
fluxes = f.random_array(nvars, ndim, nx+6, ny+6, nz+6)
metrics = f.random_array(ndim, ndim, nx+6, ny+6, nz+6)
metric_jacobians = f.random_array(nx+6, ny+6, nz+6)
flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
nz+6), dtype=np.float32, order="F")
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
if 0:
with open("gen-code.cl", "w") as outf:
outf.write(lp.generate_code_v2(prg).device_code())
prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
prg = lp.set_options(prg, ignore_boostable_into=True, write_wrapper=True)
#op_map = lp.get_op_map(prg, count_redundant_work=False)
#print(op_map)
from functools import partial
run = partial(prg, queue, nvars=nvars, ndim=ndim,
states=states, fluxes=fluxes, metrics=metrics,
metric_jacobians=metric_jacobians,
flux_derivatives=flux_derivatives_dev)
print("warmup")
for iwarmup_round in range(2):
run()
nrounds = 10
queue.finish()
print("timing")
from time import time
start = time()
for iround in range(nrounds):
run()
queue.finish()
one_round = (time() - start)/nrounds
print(f"DOFs/s: {n**3/one_round}, elapsed per round: {one_round} s")
# This lets you run 'python test.py test_case(cl._csc)' without pytest.
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
else:
from pytest import main
main([__file__])