From f3a2d6b2a0787465f56ac97cdccc802edfd4905f Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 10:59:11 -0500
Subject: [PATCH 01/21] run all tests, even slow ones

---
 test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test.py b/test.py
index 924171f..e97a345 100644
--- a/test.py
+++ b/test.py
@@ -53,7 +53,7 @@ def test_matvec(ctx_factory):
     compare.arrays(a@b, c)
 
 
-@pytest.mark.slow
+#@pytest.mark.slow
 def test_compute_flux_derivatives(ctx_factory):
     queue = device.get_queue(ctx_factory)
     prg = program.get_weno()
@@ -65,7 +65,7 @@ def test_compute_flux_derivatives(ctx_factory):
     kernel.compute_flux_derivatives(queue, prg, params, arrays)
 
 
-@pytest.mark.slow
+#@pytest.mark.slow
 def test_compute_flux_derivatives_gpu(ctx_factory):
     queue = device.get_queue(ctx_factory)
     prg = program.get_weno()
-- 
GitLab


From 0b99456599c7673f20182efe728de57d40afc05e Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 11:03:12 -0500
Subject: [PATCH 02/21] move comparison fixtures to main test file

---
 comparison_fixtures.py | 20 --------------------
 test.py                | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 24 deletions(-)
 delete mode 100644 comparison_fixtures.py

diff --git a/comparison_fixtures.py b/comparison_fixtures.py
deleted file mode 100644
index 04c7432..0000000
--- a/comparison_fixtures.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import numpy as np
-from pytest import approx
-
-
-def arrays(a, b):
-    assert a == approx(b)
-
-
-def roe_identity(states, R, Rinv):
-    dState = states[:,1] - states[:,0]
-    arrays(R@(Rinv@dState), dState)
-
-
-def roe_property(states, fluxes, R, Rinv, lam):
-    dState = states[:,1] - states[:,0]
-    dFlux = fluxes[:,1] - fluxes[:,0]
-
-    temp = Rinv@dState
-    temp = np.multiply(lam, temp)
-    arrays(R@temp, dFlux)
diff --git a/test.py b/test.py
index e97a345..d48f18a 100644
--- a/test.py
+++ b/test.py
@@ -1,7 +1,16 @@
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
+import pyopencl.clrandom  # noqa
+import loopy as lp  # noqa
+
 import sys
 import logging
 
 import pytest
+from pytest import approx
 import pyopencl as cl
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
@@ -12,7 +21,25 @@ import program_fixtures as program
 import transform_fixtures as transform
 import setup_fixtures as setup
 import kernel_fixtures as kernel
-import comparison_fixtures as compare
+
+
+def compare_arrays(a, b):
+    assert a == approx(b)
+
+
+def compare_roe_identity(states, R, Rinv):
+    dState = states[:,1] - states[:,0]
+    compare_arrays(R@(Rinv@dState), dState)
+
+
+def compare_roe_property(states, fluxes, R, Rinv, lam):
+    dState = states[:,1] - states[:,0]
+    dFlux = fluxes[:,1] - fluxes[:,0]
+
+    temp = Rinv@dState
+    temp = np.multiply(lam, temp)
+    compare_arrays(R@temp, dFlux)
+
 
 @pytest.mark.xfail
 @pytest.mark.parametrize("states_str,fluxes_str,direction", [
@@ -35,10 +62,10 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
     metrics_frozen = setup.identity(params.ndim)
     R, Rinv, lam = kernel.roe_eigensystem(queue, prg, params, states, metrics_frozen)
 
-    compare.roe_identity(states, R, Rinv)
+    compare_roe_identity(states, R, Rinv)
 
     fluxes = setup.array_from_string(fluxes_str)
-    compare.roe_property(states, fluxes, R, Rinv, lam)
+    compare_roe_property(states, fluxes, R, Rinv, lam)
 
 
 def test_matvec(ctx_factory):
@@ -50,7 +77,7 @@ def test_matvec(ctx_factory):
 
     c = kernel.mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
 
-    compare.arrays(a@b, c)
+    compare_arrays(a@b, c)
 
 
 #@pytest.mark.slow
-- 
GitLab


From 440ecc0f045eb2fa94091e0ff805000d858386f4 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 12:05:08 -0500
Subject: [PATCH 03/21] put transformation fixtures in test.py

---
 test.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/test.py b/test.py
index d48f18a..6b42fdd 100644
--- a/test.py
+++ b/test.py
@@ -18,7 +18,6 @@ from pyopencl.tools import (  # noqa
 
 import device_fixtures as device
 import program_fixtures as program
-import transform_fixtures as transform
 import setup_fixtures as setup
 import kernel_fixtures as kernel
 
@@ -41,6 +40,70 @@ def compare_roe_property(states, fluxes, R, Rinv, lam):
     compare_arrays(R@temp, dFlux)
 
 
+def transform_compute_flux_derivative_basic(prg):
+    cfd = prg["compute_flux_derivatives"]
+
+    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+
+    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
+            lp.AddressSpace.GLOBAL)
+
+    return prg.with_kernel(cfd)
+
+
+def transform_weno_for_gpu(prg):
+    prg = transform_compute_flux_derivative_basic(prg)
+
+    cfd = prg["compute_flux_derivatives"]
+
+    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
+        cfd = lp.split_iname(cfd, "i"+suffix, 16,
+                outer_tag="g.0", inner_tag="l.0")
+        cfd = lp.split_iname(cfd, "j"+suffix, 16,
+                outer_tag="g.1", inner_tag="l.1")
+
+    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
+        cfd = lp.assignment_to_subst(cfd, var_name)
+
+    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
+    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
+
+    prg = prg.with_kernel(cfd)
+
+    # FIXME: These should work, but don't
+    # FIXME: Undo the hand-inlining in WENO.F90
+    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
+    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
+
+    if 0:
+        print(prg["convert_to_generalized_frozen"])
+        1/0
+
+    return prg
+
+
+def transform_compute_flux_derivative_gpu(queue, prg):
+    prg = transform_weno_for_gpu(prg)
+
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+
+    if 1:
+        with open("gen-code.cl", "w") as outf:
+            outf.write(lp.generate_code_v2(prg).device_code())
+
+    prg = lp.set_options(prg, no_numpy=True)
+    return prg
+
+
 @pytest.mark.xfail
 @pytest.mark.parametrize("states_str,fluxes_str,direction", [
     ("2 1,4 1,4 1,4 1,20 5.5", "4 1,11.2 2.6,8 1,8 1,46.4 7.1", "x"),
@@ -84,7 +147,7 @@ def test_matvec(ctx_factory):
 def test_compute_flux_derivatives(ctx_factory):
     queue = device.get_queue(ctx_factory)
     prg = program.get_weno()
-    prg = transform.compute_flux_derivative_basic(prg)
+    prg = transform_compute_flux_derivative_basic(prg)
 
     params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup.random_flux_derivative_arrays(params)
@@ -96,7 +159,7 @@ def test_compute_flux_derivatives(ctx_factory):
 def test_compute_flux_derivatives_gpu(ctx_factory):
     queue = device.get_queue(ctx_factory)
     prg = program.get_weno()
-    prg = transform.compute_flux_derivative_gpu(queue, prg)
+    prg = transform_compute_flux_derivative_gpu(queue, prg)
 
     params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup.random_flux_derivative_arrays_on_device(ctx_factory, params)
-- 
GitLab


From a4df56a052e0f5d01cf29844d4d9f8538096363f Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 12:11:21 -0500
Subject: [PATCH 04/21] move program/device fixtures inside test.py

---
 test.py | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/test.py b/test.py
index 6b42fdd..3db55b6 100644
--- a/test.py
+++ b/test.py
@@ -16,12 +16,43 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-import device_fixtures as device
-import program_fixtures as program
 import setup_fixtures as setup
 import kernel_fixtures as kernel
 
 
+_QUEUE = []
+
+
+def get_queue(ctx_factory):
+    if not _QUEUE:
+        setup_queue(ctx_factory)
+    return _QUEUE[0]
+
+
+def setup_queue(ctx_factory):
+    ctx = ctx_factory()
+    _QUEUE.append(cl.CommandQueue(ctx))
+
+
+_WENO_PRG = []
+
+
+def parse_weno():
+    fn = "WENO.F90"
+
+    with open(fn, "r") as infile:
+        infile_content = infile.read()
+
+    prg = lp.parse_transformed_fortran(infile_content, filename=fn)
+    _WENO_PRG.append(prg)
+
+
+def get_weno_program():
+    if not _WENO_PRG:
+        parse_weno()
+    return _WENO_PRG[0]
+
+
 def compare_arrays(a, b):
     assert a == approx(b)
 
@@ -117,8 +148,8 @@ def transform_compute_flux_derivative_gpu(queue, prg):
     ("2 1,4 1,8 2,12 3,64 11", "12 3,24 3,48 6,75.2 10.6,403.2 37.8", "z")
     ])
 def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
-    queue = device.get_queue(ctx_factory)
-    prg = program.get_weno()
+    queue = get_queue(ctx_factory)
+    prg = get_weno_program()
 
     params = setup.roe_params(nvars=5, ndim=3, direction=direction)
     states = setup.array_from_string(states_str)
@@ -132,8 +163,8 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
 
 
 def test_matvec(ctx_factory):
-    queue = device.get_queue(ctx_factory)
-    prg = program.get_weno()
+    queue = get_queue(ctx_factory)
+    prg = get_weno_program()
 
     a = setup.random_array(10, 10)
     b = setup.random_array(10)
@@ -145,8 +176,8 @@ def test_matvec(ctx_factory):
 
 #@pytest.mark.slow
 def test_compute_flux_derivatives(ctx_factory):
-    queue = device.get_queue(ctx_factory)
-    prg = program.get_weno()
+    queue = get_queue(ctx_factory)
+    prg = get_weno_program()
     prg = transform_compute_flux_derivative_basic(prg)
 
     params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
@@ -157,8 +188,8 @@ def test_compute_flux_derivatives(ctx_factory):
 
 #@pytest.mark.slow
 def test_compute_flux_derivatives_gpu(ctx_factory):
-    queue = device.get_queue(ctx_factory)
-    prg = program.get_weno()
+    queue = get_queue(ctx_factory)
+    prg = get_weno_program()
     prg = transform_compute_flux_derivative_gpu(queue, prg)
 
     params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
-- 
GitLab


From d5afdfbb66d8ceb87b8917d316715d61dc6db5a8 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 12:26:51 -0500
Subject: [PATCH 05/21] put setup fixtures in test.py

---
 test.py | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 143 insertions(+), 11 deletions(-)

diff --git a/test.py b/test.py
index 3db55b6..dbc6261 100644
--- a/test.py
+++ b/test.py
@@ -16,7 +16,6 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-import setup_fixtures as setup
 import kernel_fixtures as kernel
 
 
@@ -53,6 +52,139 @@ def get_weno_program():
     return _WENO_PRG[0]
 
 
+class RoeParams:
+    def __init__(self, nvars, ndim, d):
+        self.nvars = nvars
+        self.ndim = ndim
+        self.d = d
+
+    def mat_bounds(self):
+        return self.nvars, self.nvars
+
+    def vec_bounds(self):
+        return self.nvars
+
+
+class FluxDerivativeParams:
+    def __init__(self, nvars, ndim, nx, ny, nz):
+        self.nvars = nvars
+        self.ndim = ndim
+
+        self.nx = nx
+        self.ny = ny
+        self.nz = nz
+
+        self.nhalo = 3
+        self.nx_halo = self.nx + 2*self.nhalo
+        self.ny_halo = self.ny + 2*self.nhalo
+        self.nz_halo = self.nz + 2*self.nhalo
+
+    def state_bounds(self):
+        return self.nvars, self.nx_halo, self.ny_halo, self.nz_halo
+
+    def flux_bounds(self):
+        return self.nvars, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
+
+    def metric_bounds(self):
+        return self.ndim, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
+
+    def jacobian_bounds(self):
+        return self.nx_halo, self.ny_halo, self.nz_halo
+
+
+class FluxDerivativeArrays:
+    def __init__(self, states, fluxes, metrics, metric_jacobians):
+        self.states = states
+        self.fluxes = fluxes
+        self.metrics = metrics
+        self.metric_jacobians = metric_jacobians
+
+
+def setup_roe_params(nvars, ndim, direction):
+    dirs = {"x" : 1, "y" : 2, "z" : 3}
+    return RoeParams(nvars, ndim, dirs[direction])
+
+
+def setup_flux_derivative_params(nvars, ndim, n):
+    return FluxDerivativeParams(nvars, ndim, n, n, n)
+
+
+def setup_empty_array_on_device(queue, shape):
+    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
+
+
+def setup_identity(n):
+    return np.identity(n).astype(np.float32).copy(order="F")
+
+
+def setup_random_array(*shape):
+    return np.random.random_sample(shape).astype(np.float32).copy(order="F")
+
+
+def setup_random_array_on_device(queue, *shape):
+    return cl.array.to_device(queue, setup_random_array(*shape))
+
+
+def setup_random_flux_derivative_arrays(params):
+    states = setup_random_array(*params.state_bounds())
+    fluxes = setup_random_array(*params.flux_bounds())
+    metrics = setup_random_array(*params.metric_bounds())
+    metric_jacobians = setup_random_array(*params.jacobian_bounds())
+
+    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
+
+
+def setup_random_flux_derivative_arrays_on_device(ctx_factory, params):
+    queue = get_queue(ctx_factory)
+
+    states = setup_random_array_on_device(queue, *params.state_bounds())
+    fluxes = setup_random_array_on_device(queue, *params.flux_bounds())
+    metrics = setup_random_array_on_device(queue, *params.metric_bounds())
+    metric_jacobians = setup_random_array_on_device(queue, *params.jacobian_bounds())
+
+    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
+
+
+def arrays_from_string(string_arrays):
+    return split_map_to_list(string_arrays, array_from_string, ":")
+
+
+def array_from_string(string_array):
+    if ";" not in string_array:
+        if "," not in string_array:
+            array = array_from_string_1d(string_array)
+        else:
+            array = array_from_string_2d(string_array)
+    else:
+        array = array_from_string_3d(string_array)
+    return array.copy(order="F")
+
+
+def array_from_string_3d(string_array):
+    if string_array[0] == ";":
+        return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1))
+    else:
+        return np.array(split_map_to_list(string_array, array_from_string_2d, ";"))
+
+
+def array_from_string_2d(string_array):
+    if string_array[0] == ",":
+        return array_from_string_1d(string_array[1:]).reshape((-1, 1))
+    else:
+        return np.array(split_map_to_list(string_array, array_from_string_1d, ","))
+
+
+def array_from_string_1d(string_array):
+    if string_array[0] == "i":
+        return np.array(split_map_to_list(string_array[1:], int, " "))
+    else:
+        return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32)
+
+
+def split_map_to_list(string, map_func, splitter):
+    return list(map(map_func, string.split(splitter)))
+
+
 def compare_arrays(a, b):
     assert a == approx(b)
 
@@ -151,14 +283,14 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
     queue = get_queue(ctx_factory)
     prg = get_weno_program()
 
-    params = setup.roe_params(nvars=5, ndim=3, direction=direction)
-    states = setup.array_from_string(states_str)
-    metrics_frozen = setup.identity(params.ndim)
+    params = setup_roe_params(nvars=5, ndim=3, direction=direction)
+    states = array_from_string(states_str)
+    metrics_frozen = setup_identity(params.ndim)
     R, Rinv, lam = kernel.roe_eigensystem(queue, prg, params, states, metrics_frozen)
 
     compare_roe_identity(states, R, Rinv)
 
-    fluxes = setup.array_from_string(fluxes_str)
+    fluxes = array_from_string(fluxes_str)
     compare_roe_property(states, fluxes, R, Rinv, lam)
 
 
@@ -166,8 +298,8 @@ def test_matvec(ctx_factory):
     queue = get_queue(ctx_factory)
     prg = get_weno_program()
 
-    a = setup.random_array(10, 10)
-    b = setup.random_array(10)
+    a = setup_random_array(10, 10)
+    b = setup_random_array(10)
 
     c = kernel.mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
 
@@ -180,8 +312,8 @@ def test_compute_flux_derivatives(ctx_factory):
     prg = get_weno_program()
     prg = transform_compute_flux_derivative_basic(prg)
 
-    params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
-    arrays = setup.random_flux_derivative_arrays(params)
+    params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
+    arrays = setup_random_flux_derivative_arrays(params)
 
     kernel.compute_flux_derivatives(queue, prg, params, arrays)
 
@@ -192,8 +324,8 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     prg = get_weno_program()
     prg = transform_compute_flux_derivative_gpu(queue, prg)
 
-    params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
-    arrays = setup.random_flux_derivative_arrays_on_device(ctx_factory, params)
+    params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
+    arrays = setup_random_flux_derivative_arrays_on_device(ctx_factory, params)
 
     kernel.compute_flux_derivatives(queue, prg, params, arrays)
 
-- 
GitLab


From 2797f50236c512939a604175a6e7572ec1303f72 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Mon, 17 Jun 2019 12:33:37 -0500
Subject: [PATCH 06/21] move kernel fixtures into test.py

---
 test.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/test.py b/test.py
index dbc6261..17e7f8f 100644
--- a/test.py
+++ b/test.py
@@ -16,8 +16,6 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-import kernel_fixtures as kernel
-
 
 _QUEUE = []
 
@@ -185,6 +183,52 @@ def split_map_to_list(string, map_func, splitter):
     return list(map(map_func, string.split(splitter)))
 
 
+def with_root_kernel(prg, root_name):
+    # FIXME This is a little less beautiful than it could be
+    new_prg = prg.copy(name=root_name)
+    for name in prg:
+        clbl = new_prg[name]
+        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
+            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
+
+    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
+    return new_prg
+
+
+def kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen):
+    R_dev = setup_empty_array_on_device(queue, params.mat_bounds())
+    Rinv_dev = setup_empty_array_on_device(queue, params.mat_bounds())
+    lam_dev = setup_empty_array_on_device(queue, params.vec_bounds())
+
+    prg = with_root_kernel(prg, "roe_eigensystem")
+    prg(queue, nvars=params.nvars, ndim=params.ndim, d=params.d,
+            states=states, metrics_frozen=metrics_frozen,
+            R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev)
+
+    return R_dev.get(), Rinv_dev.get(), lam_dev.get()
+
+
+def kernel_mult_mat_vec(queue, prg, alpha, a, b):
+    c_dev = setup_empty_array_on_device(queue, b.shape)
+
+    prg = with_root_kernel(prg, "mult_mat_vec")
+    prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
+
+    return c_dev.get()
+
+
+def kernel_compute_flux_derivatives(queue, prg, params, arrays):
+    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
+        params.nx_halo, params.ny_halo, params.nz_halo))
+
+    prg(queue, nvars=params.nvars, ndim=params.ndim,
+            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
+            metric_jacobians=arrays.metric_jacobians,
+            flux_derivatives=flux_derivatives_dev)
+
+    return flux_derivatives_dev.get()
+
+
 def compare_arrays(a, b):
     assert a == approx(b)
 
@@ -286,7 +330,7 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
     params = setup_roe_params(nvars=5, ndim=3, direction=direction)
     states = array_from_string(states_str)
     metrics_frozen = setup_identity(params.ndim)
-    R, Rinv, lam = kernel.roe_eigensystem(queue, prg, params, states, metrics_frozen)
+    R, Rinv, lam = kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen)
 
     compare_roe_identity(states, R, Rinv)
 
@@ -301,7 +345,7 @@ def test_matvec(ctx_factory):
     a = setup_random_array(10, 10)
     b = setup_random_array(10)
 
-    c = kernel.mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
+    c = kernel_mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
 
     compare_arrays(a@b, c)
 
@@ -315,7 +359,7 @@ def test_compute_flux_derivatives(ctx_factory):
     params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup_random_flux_derivative_arrays(params)
 
-    kernel.compute_flux_derivatives(queue, prg, params, arrays)
+    kernel_compute_flux_derivatives(queue, prg, params, arrays)
 
 
 #@pytest.mark.slow
@@ -327,7 +371,7 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup_random_flux_derivative_arrays_on_device(ctx_factory, params)
 
-    kernel.compute_flux_derivatives(queue, prg, params, arrays)
+    kernel_compute_flux_derivatives(queue, prg, params, arrays)
 
 
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.
-- 
GitLab


From 59267775b2d303af5f6882704a511d7cec770e64 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Wed, 19 Jun 2019 21:34:20 -0500
Subject: [PATCH 07/21] create utilities.py for all utilities, remove fixture
 files, update benchmark script to use utilities

---
 benchmark.py                          |  28 ++++--
 device_fixtures.py                    |  15 ---
 kernel_fixtures.py                    |  49 ---------
 program_fixtures.py                   |  20 ----
 setup_fixtures.py                     | 138 --------------------------
 test.py                               |  98 +-----------------
 transform_fixtures.py => utilities.py |  51 ++++++++--
 7 files changed, 63 insertions(+), 336 deletions(-)
 delete mode 100644 device_fixtures.py
 delete mode 100644 kernel_fixtures.py
 delete mode 100644 program_fixtures.py
 delete mode 100644 setup_fixtures.py
 rename transform_fixtures.py => utilities.py (67%)

diff --git a/benchmark.py b/benchmark.py
index 00034a7..444b689 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -14,18 +14,24 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-import device_fixtures as device
-import program_fixtures as program
-import transform_fixtures as transform
-import setup_fixtures as setup
+from utilities import *
+
+
+def setup_random_array(*shape):
+    return np.random.random_sample(shape).astype(np.float32).copy(order="F")
+
+
+def setup_random_array_on_device(queue, *shape):
+    return cl.array.to_device(queue, setup_random_array(*shape))
+
 
 def benchmark_compute_flux_derivatives_gpu(ctx_factory):
     logging.basicConfig(level="INFO")
 
-    prg = program.get_weno()
-    prg = transform.weno_for_gpu(prg)
+    prg = get_weno_program()
+    prg = transform_weno_for_gpu(prg)
 
-    queue = device.get_queue(ctx_factory)
+    queue = get_queue(ctx_factory)
 
     ndim = 3
     nvars = 5
@@ -35,10 +41,10 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
     nz = n
 
     print("ARRAY GEN")
-    states = setup.random_array_on_device(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = setup.random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = setup.random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = setup.random_array_on_device(queue, nx+6, ny+6, nz+6)
+    states = setup_random_array_on_device(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = setup_random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = setup_random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = setup_random_array_on_device(queue, nx+6, ny+6, nz+6)
     print("END ARRAY GEN")
 
     flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
diff --git a/device_fixtures.py b/device_fixtures.py
deleted file mode 100644
index d0dbc59..0000000
--- a/device_fixtures.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import pyopencl as cl
-
-
-_QUEUE = []
-
-
-def get_queue(ctx_factory):
-    if not _QUEUE:
-        setup_queue(ctx_factory)
-    return _QUEUE[0]
-
-
-def setup_queue(ctx_factory):
-    ctx = ctx_factory()
-    _QUEUE.append(cl.CommandQueue(ctx))
diff --git a/kernel_fixtures.py b/kernel_fixtures.py
deleted file mode 100644
index 7f3dff4..0000000
--- a/kernel_fixtures.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import loopy as lp  # noqa
-
-import setup_fixtures as setup
-
-
-def with_root_kernel(prg, root_name):
-    # FIXME This is a little less beautiful than it could be
-    new_prg = prg.copy(name=root_name)
-    for name in prg:
-        clbl = new_prg[name]
-        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
-            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
-
-    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
-    return new_prg
-
-
-def roe_eigensystem(queue, prg, params, states, metrics_frozen):
-    R_dev = setup.empty_array_on_device(queue, params.mat_bounds())
-    Rinv_dev = setup.empty_array_on_device(queue, params.mat_bounds())
-    lam_dev = setup.empty_array_on_device(queue, params.vec_bounds())
-
-    prg = with_root_kernel(prg, "roe_eigensystem")
-    prg(queue, nvars=params.nvars, ndim=params.ndim, d=params.d,
-            states=states, metrics_frozen=metrics_frozen,
-            R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev)
-
-    return R_dev.get(), Rinv_dev.get(), lam_dev.get()
-
-
-def mult_mat_vec(queue, prg, alpha, a, b):
-    c_dev = setup.empty_array_on_device(queue, b.shape)
-
-    prg = with_root_kernel(prg, "mult_mat_vec")
-    prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
-
-    return c_dev.get()
-
-
-def compute_flux_derivatives(queue, prg, params, arrays):
-    flux_derivatives_dev = setup.empty_array_on_device(queue, (params.nvars, params.ndim,
-        params.nx_halo, params.ny_halo, params.nz_halo))
-
-    prg(queue, nvars=params.nvars, ndim=params.ndim,
-            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
-            metric_jacobians=arrays.metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
-
-    return flux_derivatives_dev.get()
diff --git a/program_fixtures.py b/program_fixtures.py
deleted file mode 100644
index 0f50ff1..0000000
--- a/program_fixtures.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import loopy as lp
-
-
-_WENO_PRG = []
-
-
-def parse_weno():
-    fn = "WENO.F90"
-
-    with open(fn, "r") as infile:
-        infile_content = infile.read()
-
-    prg = lp.parse_transformed_fortran(infile_content, filename=fn)
-    _WENO_PRG.append(prg)
-
-
-def get_weno():
-    if not _WENO_PRG:
-        parse_weno()
-    return _WENO_PRG[0]
diff --git a/setup_fixtures.py b/setup_fixtures.py
deleted file mode 100644
index 6f1debc..0000000
--- a/setup_fixtures.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import numpy as np
-import pyopencl as cl
-import pyopencl.array  # noqa
-
-import device_fixtures as device
-
-
-class RoeParams:
-    def __init__(self, nvars, ndim, d):
-        self.nvars = nvars
-        self.ndim = ndim
-        self.d = d
-
-    def mat_bounds(self):
-        return self.nvars, self.nvars
-
-    def vec_bounds(self):
-        return self.nvars
-
-
-class FluxDerivativeParams:
-    def __init__(self, nvars, ndim, nx, ny, nz):
-        self.nvars = nvars
-        self.ndim = ndim
-
-        self.nx = nx
-        self.ny = ny
-        self.nz = nz
-
-        self.nhalo = 3
-        self.nx_halo = self.nx + 2*self.nhalo
-        self.ny_halo = self.ny + 2*self.nhalo
-        self.nz_halo = self.nz + 2*self.nhalo
-
-    def state_bounds(self):
-        return self.nvars, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def flux_bounds(self):
-        return self.nvars, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def metric_bounds(self):
-        return self.ndim, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def jacobian_bounds(self):
-        return self.nx_halo, self.ny_halo, self.nz_halo
-
-
-class FluxDerivativeArrays:
-    def __init__(self, states, fluxes, metrics, metric_jacobians):
-        self.states = states
-        self.fluxes = fluxes
-        self.metrics = metrics
-        self.metric_jacobians = metric_jacobians
-
-
-def roe_params(nvars, ndim, direction):
-    dirs = {"x" : 1, "y" : 2, "z" : 3}
-    return RoeParams(nvars, ndim, dirs[direction])
-
-
-def flux_derivative_params(nvars, ndim, n):
-    return FluxDerivativeParams(nvars, ndim, n, n, n)
-
-
-def empty_array_on_device(queue, shape):
-    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
-
-
-def identity(n):
-    return np.identity(n).astype(np.float32).copy(order="F")
-
-
-def random_array(*shape):
-    return np.random.random_sample(shape).astype(np.float32).copy(order="F")
-
-
-def random_array_on_device(queue, *shape):
-    return cl.array.to_device(queue, random_array(*shape))
-
-
-def random_flux_derivative_arrays(params):
-    states = random_array(*params.state_bounds())
-    fluxes = random_array(*params.flux_bounds())
-    metrics = random_array(*params.metric_bounds())
-    metric_jacobians = random_array(*params.jacobian_bounds())
-
-    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
-
-
-def random_flux_derivative_arrays_on_device(ctx_factory, params):
-    queue = device.get_queue(ctx_factory)
-
-    states = random_array_on_device(queue, *params.state_bounds())
-    fluxes = random_array_on_device(queue, *params.flux_bounds())
-    metrics = random_array_on_device(queue, *params.metric_bounds())
-    metric_jacobians = random_array_on_device(queue, *params.jacobian_bounds())
-
-    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
-
-
-def arrays_from_string(string_arrays):
-    return split_map_to_list(string_arrays, array_from_string, ":")
-
-
-def array_from_string(string_array):
-    if ";" not in string_array:
-        if "," not in string_array:
-            array = array_from_string_1d(string_array)
-        else:
-            array = array_from_string_2d(string_array)
-    else:
-        array = array_from_string_3d(string_array)
-    return array.copy(order="F")
-
-
-def array_from_string_3d(string_array):
-    if string_array[0] == ";":
-        return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1))
-    else:
-        return np.array(split_map_to_list(string_array, array_from_string_2d, ";"))
-
-
-def array_from_string_2d(string_array):
-    if string_array[0] == ",":
-        return array_from_string_1d(string_array[1:]).reshape((-1, 1))
-    else:
-        return np.array(split_map_to_list(string_array, array_from_string_1d, ","))
-
-
-def array_from_string_1d(string_array):
-    if string_array[0] == "i":
-        return np.array(split_map_to_list(string_array[1:], int, " "))
-    else:
-        return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32)
-
-
-def split_map_to_list(string, map_func, splitter):
-    return list(map(map_func, string.split(splitter)))
diff --git a/test.py b/test.py
index 17e7f8f..514ba6f 100644
--- a/test.py
+++ b/test.py
@@ -11,43 +11,11 @@ import logging
 
 import pytest
 from pytest import approx
-import pyopencl as cl
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 
-
-_QUEUE = []
-
-
-def get_queue(ctx_factory):
-    if not _QUEUE:
-        setup_queue(ctx_factory)
-    return _QUEUE[0]
-
-
-def setup_queue(ctx_factory):
-    ctx = ctx_factory()
-    _QUEUE.append(cl.CommandQueue(ctx))
-
-
-_WENO_PRG = []
-
-
-def parse_weno():
-    fn = "WENO.F90"
-
-    with open(fn, "r") as infile:
-        infile_content = infile.read()
-
-    prg = lp.parse_transformed_fortran(infile_content, filename=fn)
-    _WENO_PRG.append(prg)
-
-
-def get_weno_program():
-    if not _WENO_PRG:
-        parse_weno()
-    return _WENO_PRG[0]
+from utilities import *
 
 
 class RoeParams:
@@ -247,70 +215,6 @@ def compare_roe_property(states, fluxes, R, Rinv, lam):
     compare_arrays(R@temp, dFlux)
 
 
-def transform_compute_flux_derivative_basic(prg):
-    cfd = prg["compute_flux_derivatives"]
-
-    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
-
-    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-            lp.AddressSpace.GLOBAL)
-
-    return prg.with_kernel(cfd)
-
-
-def transform_weno_for_gpu(prg):
-    prg = transform_compute_flux_derivative_basic(prg)
-
-    cfd = prg["compute_flux_derivatives"]
-
-    for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
-        cfd = lp.split_iname(cfd, "i"+suffix, 16,
-                outer_tag="g.0", inner_tag="l.0")
-        cfd = lp.split_iname(cfd, "j"+suffix, 16,
-                outer_tag="g.1", inner_tag="l.1")
-
-    for var_name in ["delta_xi", "delta_eta", "delta_zeta"]:
-        cfd = lp.assignment_to_subst(cfd, var_name)
-
-    cfd = lp.add_barrier(cfd, "tag:to_generalized", "tag:flux_x_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_compute", "tag:flux_x_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_x_diff", "tag:flux_y_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_compute", "tag:flux_y_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_y_diff", "tag:flux_z_compute")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_compute", "tag:flux_z_diff")
-    cfd = lp.add_barrier(cfd, "tag:flux_z_diff", "tag:from_generalized")
-
-    prg = prg.with_kernel(cfd)
-
-    # FIXME: These should work, but don't
-    # FIXME: Undo the hand-inlining in WENO.F90
-    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
-    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
-
-    if 0:
-        print(prg["convert_to_generalized_frozen"])
-        1/0
-
-    return prg
-
-
-def transform_compute_flux_derivative_gpu(queue, prg):
-    prg = transform_weno_for_gpu(prg)
-
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 1:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
-    prg = lp.set_options(prg, no_numpy=True)
-    return prg
-
-
 @pytest.mark.xfail
 @pytest.mark.parametrize("states_str,fluxes_str,direction", [
     ("2 1,4 1,4 1,4 1,20 5.5", "4 1,11.2 2.6,8 1,8 1,46.4 7.1", "x"),
diff --git a/transform_fixtures.py b/utilities.py
similarity index 67%
rename from transform_fixtures.py
rename to utilities.py
index f69581a..a188dce 100644
--- a/transform_fixtures.py
+++ b/utilities.py
@@ -1,7 +1,46 @@
-import loopy as lp
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pyopencl.tools  # noqa
+import pyopencl.clrandom  # noqa
+import loopy as lp  # noqa
 
 
-def compute_flux_derivative_basic(prg):
+_QUEUE = []
+
+
+def get_queue(ctx_factory):
+    if not _QUEUE:
+        setup_queue(ctx_factory)
+    return _QUEUE[0]
+
+
+def setup_queue(ctx_factory):
+    ctx = ctx_factory()
+    _QUEUE.append(cl.CommandQueue(ctx))
+
+
+_WENO_PRG = []
+
+
+def parse_weno():
+    fn = "WENO.F90"
+
+    with open(fn, "r") as infile:
+        infile_content = infile.read()
+
+    prg = lp.parse_transformed_fortran(infile_content, filename=fn)
+    _WENO_PRG.append(prg)
+
+
+def get_weno_program():
+    if not _WENO_PRG:
+        parse_weno()
+    return _WENO_PRG[0]
+
+
+def transform_compute_flux_derivative_basic(prg):
     cfd = prg["compute_flux_derivatives"]
 
     cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
@@ -16,8 +55,8 @@ def compute_flux_derivative_basic(prg):
     return prg.with_kernel(cfd)
 
 
-def weno_for_gpu(prg):
-    prg = compute_flux_derivative_basic(prg)
+def transform_weno_for_gpu(prg):
+    prg = transform_compute_flux_derivative_basic(prg)
 
     cfd = prg["compute_flux_derivatives"]
 
@@ -52,8 +91,8 @@ def weno_for_gpu(prg):
     return prg
 
 
-def compute_flux_derivative_gpu(queue, prg):
-    prg = weno_for_gpu(prg)
+def transform_compute_flux_derivative_gpu(queue, prg):
+    prg = transform_weno_for_gpu(prg)
 
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
-- 
GitLab


From 68f74b816c6f6e489a3d348cd34153c2cf240942 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Wed, 19 Jun 2019 22:08:12 -0500
Subject: [PATCH 08/21] move benchmark array generation to test-local functions

---
 benchmark.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index 444b689..b48c19c 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -17,15 +17,13 @@ from pyopencl.tools import (  # noqa
 from utilities import *
 
 
-def setup_random_array(*shape):
-    return np.random.random_sample(shape).astype(np.float32).copy(order="F")
-
-
-def setup_random_array_on_device(queue, *shape):
-    return cl.array.to_device(queue, setup_random_array(*shape))
-
-
 def benchmark_compute_flux_derivatives_gpu(ctx_factory):
+    def random_array_on_device(queue, *shape):
+        return cl.array.to_device(queue, random_array(*shape))
+    
+    def random_array(*shape):
+        return np.random.random_sample(shape).astype(np.float32).copy(order="F")
+
     logging.basicConfig(level="INFO")
 
     prg = get_weno_program()
@@ -41,10 +39,10 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
     nz = n
 
     print("ARRAY GEN")
-    states = setup_random_array_on_device(queue, nvars, nx+6, ny+6, nz+6)
-    fluxes = setup_random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
-    metrics = setup_random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6)
-    metric_jacobians = setup_random_array_on_device(queue, nx+6, ny+6, nz+6)
+    states = random_array_on_device(queue, nvars, nx+6, ny+6, nz+6)
+    fluxes = random_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
+    metrics = random_array_on_device(queue, ndim, ndim, nx+6, ny+6, nz+6)
+    metric_jacobians = random_array_on_device(queue, nx+6, ny+6, nz+6)
     print("END ARRAY GEN")
 
     flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-- 
GitLab


From 13e2a49d25d7c6d892934e77e374b0003395b672 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 09:17:57 -0500
Subject: [PATCH 09/21] refactor out kernel_compute_flux_derivatives interface
 function

---
 test.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/test.py b/test.py
index 514ba6f..75b6e16 100644
--- a/test.py
+++ b/test.py
@@ -185,18 +185,6 @@ def kernel_mult_mat_vec(queue, prg, alpha, a, b):
     return c_dev.get()
 
 
-def kernel_compute_flux_derivatives(queue, prg, params, arrays):
-    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
-        params.nx_halo, params.ny_halo, params.nz_halo))
-
-    prg(queue, nvars=params.nvars, ndim=params.ndim,
-            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
-            metric_jacobians=arrays.metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
-
-    return flux_derivatives_dev.get()
-
-
 def compare_arrays(a, b):
     assert a == approx(b)
 
@@ -263,7 +251,13 @@ def test_compute_flux_derivatives(ctx_factory):
     params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup_random_flux_derivative_arrays(params)
 
-    kernel_compute_flux_derivatives(queue, prg, params, arrays)
+    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
+        params.nx_halo, params.ny_halo, params.nz_halo))
+
+    prg(queue, nvars=params.nvars, ndim=params.ndim,
+            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
+            metric_jacobians=arrays.metric_jacobians,
+            flux_derivatives=flux_derivatives_dev)
 
 
 #@pytest.mark.slow
@@ -275,7 +269,13 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
     arrays = setup_random_flux_derivative_arrays_on_device(ctx_factory, params)
 
-    kernel_compute_flux_derivatives(queue, prg, params, arrays)
+    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
+        params.nx_halo, params.ny_halo, params.nz_halo))
+
+    prg(queue, nvars=params.nvars, ndim=params.ndim,
+            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
+            metric_jacobians=arrays.metric_jacobians,
+            flux_derivatives=flux_derivatives_dev)
 
 
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.
-- 
GitLab


From b8979ede9f64ef8d23d39453292f9157c225a3e3 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 10:19:57 -0500
Subject: [PATCH 10/21] refactor test_compute_flux_derivatives to use
 lp.auto_test_vs_ref

---
 test.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/test.py b/test.py
index 75b6e16..2e810e9 100644
--- a/test.py
+++ b/test.py
@@ -244,20 +244,11 @@ def test_matvec(ctx_factory):
 
 #@pytest.mark.slow
 def test_compute_flux_derivatives(ctx_factory):
-    queue = get_queue(ctx_factory)
     prg = get_weno_program()
     prg = transform_compute_flux_derivative_basic(prg)
 
-    params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
-    arrays = setup_random_flux_derivative_arrays(params)
-
-    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
-        params.nx_halo, params.ny_halo, params.nz_halo))
-
-    prg(queue, nvars=params.nvars, ndim=params.ndim,
-            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
-            metric_jacobians=arrays.metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
+    lp.auto_test_vs_ref(prg, ctx_factory(),
+            parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
 #@pytest.mark.slow
-- 
GitLab


From 2ac94b6fddb6726b63130127f25c623df4144f5e Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 10:31:40 -0500
Subject: [PATCH 11/21] refactor test_compute_flux_derivatives_gpu to use
 lp.auto_test_vs_ref

---
 test.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/test.py b/test.py
index 2e810e9..2b694fb 100644
--- a/test.py
+++ b/test.py
@@ -253,20 +253,11 @@ def test_compute_flux_derivatives(ctx_factory):
 
 #@pytest.mark.slow
 def test_compute_flux_derivatives_gpu(ctx_factory):
-    queue = get_queue(ctx_factory)
     prg = get_weno_program()
-    prg = transform_compute_flux_derivative_gpu(queue, prg)
-
-    params = setup_flux_derivative_params(ndim=3, nvars=5, n=10)
-    arrays = setup_random_flux_derivative_arrays_on_device(ctx_factory, params)
-
-    flux_derivatives_dev = setup_empty_array_on_device(queue, (params.nvars, params.ndim,
-        params.nx_halo, params.ny_halo, params.nz_halo))
+    prg = transform_compute_flux_derivative_gpu(get_queue(ctx_factory), prg)
 
-    prg(queue, nvars=params.nvars, ndim=params.ndim,
-            states=arrays.states, fluxes=arrays.fluxes, metrics=arrays.metrics,
-            metric_jacobians=arrays.metric_jacobians,
-            flux_derivatives=flux_derivatives_dev)
+    lp.auto_test_vs_ref(prg, ctx_factory(),
+            parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
 # This lets you run 'python test.py test_case(cl._csc)' without pytest.
-- 
GitLab


From c2d76477b408a2cc7f7276fe41e44ca7db541d5b Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 11:00:12 -0500
Subject: [PATCH 12/21] heavy reorganization to put things in utilities.py or
 as test-local functions

---
 benchmark.py |   6 --
 test.py      | 249 ++++++++++++---------------------------------------
 utilities.py |  63 +++++++++++++
 3 files changed, 122 insertions(+), 196 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index b48c19c..f65cd58 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -18,12 +18,6 @@ from utilities import *
 
 
 def benchmark_compute_flux_derivatives_gpu(ctx_factory):
-    def random_array_on_device(queue, *shape):
-        return cl.array.to_device(queue, random_array(*shape))
-    
-    def random_array(*shape):
-        return np.random.random_sample(shape).astype(np.float32).copy(order="F")
-
     logging.basicConfig(level="INFO")
 
     prg = get_weno_program()
diff --git a/test.py b/test.py
index 2b694fb..7d84ace 100644
--- a/test.py
+++ b/test.py
@@ -18,191 +18,6 @@ from pyopencl.tools import (  # noqa
 from utilities import *
 
 
-class RoeParams:
-    def __init__(self, nvars, ndim, d):
-        self.nvars = nvars
-        self.ndim = ndim
-        self.d = d
-
-    def mat_bounds(self):
-        return self.nvars, self.nvars
-
-    def vec_bounds(self):
-        return self.nvars
-
-
-class FluxDerivativeParams:
-    def __init__(self, nvars, ndim, nx, ny, nz):
-        self.nvars = nvars
-        self.ndim = ndim
-
-        self.nx = nx
-        self.ny = ny
-        self.nz = nz
-
-        self.nhalo = 3
-        self.nx_halo = self.nx + 2*self.nhalo
-        self.ny_halo = self.ny + 2*self.nhalo
-        self.nz_halo = self.nz + 2*self.nhalo
-
-    def state_bounds(self):
-        return self.nvars, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def flux_bounds(self):
-        return self.nvars, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def metric_bounds(self):
-        return self.ndim, self.ndim, self.nx_halo, self.ny_halo, self.nz_halo
-
-    def jacobian_bounds(self):
-        return self.nx_halo, self.ny_halo, self.nz_halo
-
-
-class FluxDerivativeArrays:
-    def __init__(self, states, fluxes, metrics, metric_jacobians):
-        self.states = states
-        self.fluxes = fluxes
-        self.metrics = metrics
-        self.metric_jacobians = metric_jacobians
-
-
-def setup_roe_params(nvars, ndim, direction):
-    dirs = {"x" : 1, "y" : 2, "z" : 3}
-    return RoeParams(nvars, ndim, dirs[direction])
-
-
-def setup_flux_derivative_params(nvars, ndim, n):
-    return FluxDerivativeParams(nvars, ndim, n, n, n)
-
-
-def setup_empty_array_on_device(queue, shape):
-    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
-
-
-def setup_identity(n):
-    return np.identity(n).astype(np.float32).copy(order="F")
-
-
-def setup_random_array(*shape):
-    return np.random.random_sample(shape).astype(np.float32).copy(order="F")
-
-
-def setup_random_array_on_device(queue, *shape):
-    return cl.array.to_device(queue, setup_random_array(*shape))
-
-
-def setup_random_flux_derivative_arrays(params):
-    states = setup_random_array(*params.state_bounds())
-    fluxes = setup_random_array(*params.flux_bounds())
-    metrics = setup_random_array(*params.metric_bounds())
-    metric_jacobians = setup_random_array(*params.jacobian_bounds())
-
-    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
-
-
-def setup_random_flux_derivative_arrays_on_device(ctx_factory, params):
-    queue = get_queue(ctx_factory)
-
-    states = setup_random_array_on_device(queue, *params.state_bounds())
-    fluxes = setup_random_array_on_device(queue, *params.flux_bounds())
-    metrics = setup_random_array_on_device(queue, *params.metric_bounds())
-    metric_jacobians = setup_random_array_on_device(queue, *params.jacobian_bounds())
-
-    return FluxDerivativeArrays(states, fluxes, metrics, metric_jacobians)
-
-
-def arrays_from_string(string_arrays):
-    return split_map_to_list(string_arrays, array_from_string, ":")
-
-
-def array_from_string(string_array):
-    if ";" not in string_array:
-        if "," not in string_array:
-            array = array_from_string_1d(string_array)
-        else:
-            array = array_from_string_2d(string_array)
-    else:
-        array = array_from_string_3d(string_array)
-    return array.copy(order="F")
-
-
-def array_from_string_3d(string_array):
-    if string_array[0] == ";":
-        return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1))
-    else:
-        return np.array(split_map_to_list(string_array, array_from_string_2d, ";"))
-
-
-def array_from_string_2d(string_array):
-    if string_array[0] == ",":
-        return array_from_string_1d(string_array[1:]).reshape((-1, 1))
-    else:
-        return np.array(split_map_to_list(string_array, array_from_string_1d, ","))
-
-
-def array_from_string_1d(string_array):
-    if string_array[0] == "i":
-        return np.array(split_map_to_list(string_array[1:], int, " "))
-    else:
-        return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32)
-
-
-def split_map_to_list(string, map_func, splitter):
-    return list(map(map_func, string.split(splitter)))
-
-
-def with_root_kernel(prg, root_name):
-    # FIXME This is a little less beautiful than it could be
-    new_prg = prg.copy(name=root_name)
-    for name in prg:
-        clbl = new_prg[name]
-        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
-            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
-
-    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
-    return new_prg
-
-
-def kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen):
-    R_dev = setup_empty_array_on_device(queue, params.mat_bounds())
-    Rinv_dev = setup_empty_array_on_device(queue, params.mat_bounds())
-    lam_dev = setup_empty_array_on_device(queue, params.vec_bounds())
-
-    prg = with_root_kernel(prg, "roe_eigensystem")
-    prg(queue, nvars=params.nvars, ndim=params.ndim, d=params.d,
-            states=states, metrics_frozen=metrics_frozen,
-            R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev)
-
-    return R_dev.get(), Rinv_dev.get(), lam_dev.get()
-
-
-def kernel_mult_mat_vec(queue, prg, alpha, a, b):
-    c_dev = setup_empty_array_on_device(queue, b.shape)
-
-    prg = with_root_kernel(prg, "mult_mat_vec")
-    prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
-
-    return c_dev.get()
-
-
-def compare_arrays(a, b):
-    assert a == approx(b)
-
-
-def compare_roe_identity(states, R, Rinv):
-    dState = states[:,1] - states[:,0]
-    compare_arrays(R@(Rinv@dState), dState)
-
-
-def compare_roe_property(states, fluxes, R, Rinv, lam):
-    dState = states[:,1] - states[:,0]
-    dFlux = fluxes[:,1] - fluxes[:,0]
-
-    temp = Rinv@dState
-    temp = np.multiply(lam, temp)
-    compare_arrays(R@temp, dFlux)
-
-
 @pytest.mark.xfail
 @pytest.mark.parametrize("states_str,fluxes_str,direction", [
     ("2 1,4 1,4 1,4 1,20 5.5", "4 1,11.2 2.6,8 1,8 1,46.4 7.1", "x"),
@@ -216,26 +31,80 @@ def compare_roe_property(states, fluxes, R, Rinv, lam):
     ("2 1,4 1,8 2,12 3,64 11", "12 3,24 3,48 6,75.2 10.6,403.2 37.8", "z")
     ])
 def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
+    class RoeParams:
+        def __init__(self, nvars, ndim, d):
+            self.nvars = nvars
+            self.ndim = ndim
+            self.d = d
+
+        def mat_bounds(self):
+            return self.nvars, self.nvars
+
+        def vec_bounds(self):
+            return self.nvars
+
+    def setup_roe_params(nvars, ndim, direction):
+        dirs = {"x" : 1, "y" : 2, "z" : 3}
+        return RoeParams(nvars, ndim, dirs[direction])
+
+    def identity_matrix(n):
+        return np.identity(n).astype(np.float32).copy(order="F")
+
+    def kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen):
+        R_dev = empty_array_on_device(queue, params.mat_bounds())
+        Rinv_dev = empty_array_on_device(queue, params.mat_bounds())
+        lam_dev = empty_array_on_device(queue, params.vec_bounds())
+
+        prg = with_root_kernel(prg, "roe_eigensystem")
+        prg(queue, nvars=params.nvars, ndim=params.ndim, d=params.d,
+                states=states, metrics_frozen=metrics_frozen,
+                R=R_dev, R_inv=Rinv_dev, lambda_roe=lam_dev)
+
+        return R_dev.get(), Rinv_dev.get(), lam_dev.get()
+
+    def check_roe_identity(states, R, Rinv):
+        dState = states[:,1] - states[:,0]
+        compare_arrays(R@(Rinv@dState), dState)
+
+    def check_roe_property(states, fluxes, R, Rinv, lam):
+        dState = states[:,1] - states[:,0]
+        dFlux = fluxes[:,1] - fluxes[:,0]
+
+        temp = Rinv@dState
+        temp = np.multiply(lam, temp)
+        compare_arrays(R@temp, dFlux)
+
     queue = get_queue(ctx_factory)
     prg = get_weno_program()
 
     params = setup_roe_params(nvars=5, ndim=3, direction=direction)
     states = array_from_string(states_str)
-    metrics_frozen = setup_identity(params.ndim)
+    metrics_frozen = identity_matrix(params.ndim)
     R, Rinv, lam = kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen)
 
-    compare_roe_identity(states, R, Rinv)
+    check_roe_identity(states, R, Rinv)
 
     fluxes = array_from_string(fluxes_str)
-    compare_roe_property(states, fluxes, R, Rinv, lam)
+    check_roe_property(states, fluxes, R, Rinv, lam)
 
 
 def test_matvec(ctx_factory):
+    def kernel_mult_mat_vec(queue, prg, alpha, a, b):
+        c_dev = empty_array_on_device(queue, b.shape)
+
+        prg = with_root_kernel(prg, "mult_mat_vec")
+        prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
+
+        return c_dev.get()
+
+    def random_array(*shape):
+        return np.random.random_sample(shape).astype(np.float32).copy(order="F")
+
     queue = get_queue(ctx_factory)
     prg = get_weno_program()
 
-    a = setup_random_array(10, 10)
-    b = setup_random_array(10)
+    a = random_array(10, 10)
+    b = random_array(10)
 
     c = kernel_mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
 
diff --git a/utilities.py b/utilities.py
index a188dce..8c942d7 100644
--- a/utilities.py
+++ b/utilities.py
@@ -5,6 +5,69 @@ import pyopencl.array  # noqa
 import pyopencl.tools  # noqa
 import pyopencl.clrandom  # noqa
 import loopy as lp  # noqa
+from pytest import approx
+
+
+def split_map_to_list(string, map_func, splitter):
+    return list(map(map_func, string.split(splitter)))
+
+
+def arrays_from_string(string_arrays):
+    return split_map_to_list(string_arrays, array_from_string, ":")
+
+
+def array_from_string(string_array):
+    def array_from_string_1d(string_array):
+        if string_array[0] == "i":
+            return np.array(split_map_to_list(string_array[1:], int, " "))
+        else:
+            return np.array(split_map_to_list(string_array, float, " "), dtype=np.float32)
+
+    def array_from_string_2d(string_array):
+        if string_array[0] == ",":
+            return array_from_string_1d(string_array[1:]).reshape((-1, 1))
+        else:
+            return np.array(split_map_to_list(string_array, array_from_string_1d, ","))
+
+    def array_from_string_3d(string_array):
+        if string_array[0] == ";":
+            return array_from_string_1d(string_array[1:]).reshape((-1, 1, 1))
+        else:
+            return np.array(split_map_to_list(string_array, array_from_string_2d, ";"))
+
+    if ";" not in string_array:
+        if "," not in string_array:
+            array = array_from_string_1d(string_array)
+        else:
+            array = array_from_string_2d(string_array)
+    else:
+        array = array_from_string_3d(string_array)
+    return array.copy(order="F")
+
+
+def with_root_kernel(prg, root_name):
+    # FIXME This is a little less beautiful than it could be
+    new_prg = prg.copy(name=root_name)
+    for name in prg:
+        clbl = new_prg[name]
+        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
+            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
+
+    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
+    return new_prg
+
+
+def compare_arrays(a, b):
+    assert a == approx(b)
+
+
+def random_array_on_device(queue, *shape):
+    empty = empty_array_on_device(queue, shape)
+    return cl.clrandom.fill_rand(empty)
+
+
+def empty_array_on_device(queue, shape):
+    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
 
 
 _QUEUE = []
-- 
GitLab


From a8f1ddbca390464c3db951c4802168dc9a29e268 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 11:07:21 -0500
Subject: [PATCH 13/21] reorganize order of utility functions

---
 utilities.py | 100 +++++++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/utilities.py b/utilities.py
index 8c942d7..2ada8c7 100644
--- a/utilities.py
+++ b/utilities.py
@@ -8,8 +8,19 @@ import loopy as lp  # noqa
 from pytest import approx
 
 
-def split_map_to_list(string, map_func, splitter):
-    return list(map(map_func, string.split(splitter)))
+### Arrays ###
+
+def compare_arrays(a, b):
+    assert a == approx(b)
+
+
+def random_array_on_device(queue, *shape):
+    empty = empty_array_on_device(queue, shape)
+    return cl.clrandom.fill_rand(empty)
+
+
+def empty_array_on_device(queue, shape):
+    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
 
 
 def arrays_from_string(string_arrays):
@@ -45,30 +56,11 @@ def array_from_string(string_array):
     return array.copy(order="F")
 
 
-def with_root_kernel(prg, root_name):
-    # FIXME This is a little less beautiful than it could be
-    new_prg = prg.copy(name=root_name)
-    for name in prg:
-        clbl = new_prg[name]
-        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
-            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
-
-    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
-    return new_prg
-
-
-def compare_arrays(a, b):
-    assert a == approx(b)
-
-
-def random_array_on_device(queue, *shape):
-    empty = empty_array_on_device(queue, shape)
-    return cl.clrandom.fill_rand(empty)
-
+def split_map_to_list(string, map_func, splitter):
+    return list(map(map_func, string.split(splitter)))
 
-def empty_array_on_device(queue, shape):
-    return cl.array.empty(queue, shape, dtype=np.float32, order="F")
 
+### Device ###
 
 _QUEUE = []
 
@@ -84,9 +76,17 @@ def setup_queue(ctx_factory):
     _QUEUE.append(cl.CommandQueue(ctx))
 
 
+### Program / Kernel ###
+
 _WENO_PRG = []
 
 
+def get_weno_program():
+    if not _WENO_PRG:
+        parse_weno()
+    return _WENO_PRG[0]
+
+
 def parse_weno():
     fn = "WENO.F90"
 
@@ -97,25 +97,29 @@ def parse_weno():
     _WENO_PRG.append(prg)
 
 
-def get_weno_program():
-    if not _WENO_PRG:
-        parse_weno()
-    return _WENO_PRG[0]
+def with_root_kernel(prg, root_name):
+    # FIXME This is a little less beautiful than it could be
+    new_prg = prg.copy(name=root_name)
+    for name in prg:
+        clbl = new_prg[name]
+        if isinstance(clbl, lp.LoopKernel) and clbl.is_called_from_host:
+            new_prg = new_prg.with_kernel(clbl.copy(is_called_from_host=False))
 
+    new_prg = new_prg.with_kernel(prg[root_name].copy(is_called_from_host=True))
+    return new_prg
 
-def transform_compute_flux_derivative_basic(prg):
-    cfd = prg["compute_flux_derivatives"]
 
-    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+def transform_compute_flux_derivative_gpu(queue, prg):
+    prg = transform_weno_for_gpu(prg)
 
-    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-            lp.AddressSpace.GLOBAL)
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
-    return prg.with_kernel(cfd)
+    if 1:
+        with open("gen-code.cl", "w") as outf:
+            outf.write(lp.generate_code_v2(prg).device_code())
+
+    prg = lp.set_options(prg, no_numpy=True)
+    return prg
 
 
 def transform_weno_for_gpu(prg):
@@ -154,14 +158,18 @@ def transform_weno_for_gpu(prg):
     return prg
 
 
-def transform_compute_flux_derivative_gpu(queue, prg):
-    prg = transform_weno_for_gpu(prg)
+def transform_compute_flux_derivative_basic(prg):
+    cfd = prg["compute_flux_derivatives"]
 
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+
+    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
+            lp.AddressSpace.GLOBAL)
+    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
+            lp.AddressSpace.GLOBAL)
+
+    return prg.with_kernel(cfd)
 
-    if 1:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
 
-    prg = lp.set_options(prg, no_numpy=True)
-    return prg
-- 
GitLab


From 23da6d6d0ca1ad89d6cc46ddce72fad0546a4bbb Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 13:55:57 -0500
Subject: [PATCH 14/21] use empty_array utility for benchmark code, refactor it
 to have same interface as random_array utility

---
 benchmark.py |  3 +--
 test.py      | 10 +++++-----
 utilities.py |  4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index f65cd58..df06f97 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -39,8 +39,7 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
     metric_jacobians = random_array_on_device(queue, nx+6, ny+6, nz+6)
     print("END ARRAY GEN")
 
-    flux_derivatives_dev = cl.array.empty(queue, (nvars, ndim, nx+6, ny+6,
-        nz+6), dtype=np.float32, order="F")
+    flux_derivatives_dev = empty_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
 
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
diff --git a/test.py b/test.py
index 7d84ace..a79a1f3 100644
--- a/test.py
+++ b/test.py
@@ -40,7 +40,7 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
         def mat_bounds(self):
             return self.nvars, self.nvars
 
-        def vec_bounds(self):
+        def vec_bound(self):
             return self.nvars
 
     def setup_roe_params(nvars, ndim, direction):
@@ -51,9 +51,9 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
         return np.identity(n).astype(np.float32).copy(order="F")
 
     def kernel_roe_eigensystem(queue, prg, params, states, metrics_frozen):
-        R_dev = empty_array_on_device(queue, params.mat_bounds())
-        Rinv_dev = empty_array_on_device(queue, params.mat_bounds())
-        lam_dev = empty_array_on_device(queue, params.vec_bounds())
+        R_dev = empty_array_on_device(queue, *params.mat_bounds())
+        Rinv_dev = empty_array_on_device(queue, *params.mat_bounds())
+        lam_dev = empty_array_on_device(queue, params.vec_bound())
 
         prg = with_root_kernel(prg, "roe_eigensystem")
         prg(queue, nvars=params.nvars, ndim=params.ndim, d=params.d,
@@ -90,7 +90,7 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
 
 def test_matvec(ctx_factory):
     def kernel_mult_mat_vec(queue, prg, alpha, a, b):
-        c_dev = empty_array_on_device(queue, b.shape)
+        c_dev = empty_array_on_device(queue, *b.shape)
 
         prg = with_root_kernel(prg, "mult_mat_vec")
         prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
diff --git a/utilities.py b/utilities.py
index 2ada8c7..45b1012 100644
--- a/utilities.py
+++ b/utilities.py
@@ -15,11 +15,11 @@ def compare_arrays(a, b):
 
 
 def random_array_on_device(queue, *shape):
-    empty = empty_array_on_device(queue, shape)
+    empty = empty_array_on_device(queue, *shape)
     return cl.clrandom.fill_rand(empty)
 
 
-def empty_array_on_device(queue, shape):
+def empty_array_on_device(queue, *shape):
     return cl.array.empty(queue, shape, dtype=np.float32, order="F")
 
 
-- 
GitLab


From f925d9be89ed91a6ceb27adb59f78ecf0f3fc4f5 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 14:08:09 -0500
Subject: [PATCH 15/21] refactor out utility we don't need

---
 test.py      | 12 +++++++++++-
 utilities.py | 13 -------------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/test.py b/test.py
index a79a1f3..7283726 100644
--- a/test.py
+++ b/test.py
@@ -123,7 +123,17 @@ def test_compute_flux_derivatives(ctx_factory):
 #@pytest.mark.slow
 def test_compute_flux_derivatives_gpu(ctx_factory):
     prg = get_weno_program()
-    prg = transform_compute_flux_derivative_gpu(get_queue(ctx_factory), prg)
+    prg = transform_weno_for_gpu(prg)
+
+    queue = get_queue(ctx_factory)
+
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+
+    if 1:
+        with open("gen-code.cl", "w") as outf:
+            outf.write(lp.generate_code_v2(prg).device_code())
+
+    prg = lp.set_options(prg, no_numpy=True)
 
     lp.auto_test_vs_ref(prg, ctx_factory(),
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
diff --git a/utilities.py b/utilities.py
index 45b1012..333eb4b 100644
--- a/utilities.py
+++ b/utilities.py
@@ -109,19 +109,6 @@ def with_root_kernel(prg, root_name):
     return new_prg
 
 
-def transform_compute_flux_derivative_gpu(queue, prg):
-    prg = transform_weno_for_gpu(prg)
-
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 1:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
-    prg = lp.set_options(prg, no_numpy=True)
-    return prg
-
-
 def transform_weno_for_gpu(prg):
     prg = transform_compute_flux_derivative_basic(prg)
 
-- 
GitLab


From f76f38b09496b0f14a3f7b99fcdbc27738fa8a3a Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 14:13:41 -0500
Subject: [PATCH 16/21] add flag for printing compute_flux_derivative kernel

---
 utilities.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utilities.py b/utilities.py
index 333eb4b..f911066 100644
--- a/utilities.py
+++ b/utilities.py
@@ -109,7 +109,7 @@ def with_root_kernel(prg, root_name):
     return new_prg
 
 
-def transform_weno_for_gpu(prg):
+def transform_weno_for_gpu(prg, print_kernel=False):
     prg = transform_compute_flux_derivative_basic(prg)
 
     cfd = prg["compute_flux_derivatives"]
@@ -138,7 +138,7 @@ def transform_weno_for_gpu(prg):
     #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
     #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")
 
-    if 0:
+    if print_kernel:
         print(prg["convert_to_generalized_frozen"])
         1/0
 
-- 
GitLab


From aa931210051ec87ad13cb7d76efa495adf5c6ea5 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 15:29:55 -0500
Subject: [PATCH 17/21] refactor out a utility and move the code to the diff
 loopy block in WENO.F90

---
 WENO.F90     | 14 ++++++++++++++
 test.py      |  5 +++--
 utilities.py | 17 -----------------
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/WENO.F90 b/WENO.F90
index 995d3b7..6bfad5c 100644
--- a/WENO.F90
+++ b/WENO.F90
@@ -951,6 +951,20 @@ end subroutine
 !
 ! prg = lp.parse_fortran(lp.c_preprocess(SOURCE), FILENAME)
 ! prg = lp.fix_parameters(prg, ndim=3, nvars=5, _remove=False)
+!
+! cfd = prg["compute_flux_derivatives"]
+!
+! cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
+!
+! cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
+!         lp.AddressSpace.GLOBAL)
+! cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
+!         lp.AddressSpace.GLOBAL)
+! cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
+!         lp.AddressSpace.GLOBAL)
+!
+! prg = prg.with_kernel(cfd)
+!
 ! RESULT = prg
 !
 !$loopy end
diff --git a/test.py b/test.py
index 7283726..b5db8c7 100644
--- a/test.py
+++ b/test.py
@@ -114,7 +114,9 @@ def test_matvec(ctx_factory):
 #@pytest.mark.slow
 def test_compute_flux_derivatives(ctx_factory):
     prg = get_weno_program()
-    prg = transform_compute_flux_derivative_basic(prg)
+
+    queue = get_queue(ctx_factory)
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
     lp.auto_test_vs_ref(prg, ctx_factory(),
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
@@ -126,7 +128,6 @@ def test_compute_flux_derivatives_gpu(ctx_factory):
     prg = transform_weno_for_gpu(prg)
 
     queue = get_queue(ctx_factory)
-
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
     if 1:
diff --git a/utilities.py b/utilities.py
index f911066..1f21f09 100644
--- a/utilities.py
+++ b/utilities.py
@@ -110,8 +110,6 @@ def with_root_kernel(prg, root_name):
 
 
 def transform_weno_for_gpu(prg, print_kernel=False):
-    prg = transform_compute_flux_derivative_basic(prg)
-
     cfd = prg["compute_flux_derivatives"]
 
     for suffix in ["", "_1", "_2", "_3", "_4", "_5", "_6", "_7"]:
@@ -145,18 +143,3 @@ def transform_weno_for_gpu(prg, print_kernel=False):
     return prg
 
 
-def transform_compute_flux_derivative_basic(prg):
-    cfd = prg["compute_flux_derivatives"]
-
-    cfd = lp.assume(cfd, "nx > 0 and ny > 0 and nz > 0")
-
-    cfd = lp.set_temporary_scope(cfd, "flux_derivatives_generalized",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "generalized_fluxes",
-            lp.AddressSpace.GLOBAL)
-    cfd = lp.set_temporary_scope(cfd, "weno_flux_tmp",
-            lp.AddressSpace.GLOBAL)
-
-    return prg.with_kernel(cfd)
-
-
-- 
GitLab


From 21995d88aae4f3c5877e36bdda2b2392fb99588c Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 15:43:16 -0500
Subject: [PATCH 18/21] refactor out a utility to write CL code

---
 benchmark.py | 22 +++++++++-------------
 test.py      | 10 ++++------
 utilities.py |  4 ++++
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index df06f97..5f48726 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -17,13 +17,19 @@ from pyopencl.tools import (  # noqa
 from utilities import *
 
 
-def benchmark_compute_flux_derivatives_gpu(ctx_factory):
+def benchmark_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
     logging.basicConfig(level="INFO")
 
     prg = get_weno_program()
     prg = transform_weno_for_gpu(prg)
 
     queue = get_queue(ctx_factory)
+    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
+    prg = lp.set_options(prg, no_numpy=True)
+    prg = lp.set_options(prg, ignore_boostable_into=True)
+    #prg = lp.set_options(prg, write_wrapper=True)
+    #op_map = lp.get_op_map(prg, count_redundant_work=False)
+    #print(op_map)
 
     ndim = 3
     nvars = 5
@@ -41,18 +47,8 @@ def benchmark_compute_flux_derivatives_gpu(ctx_factory):
 
     flux_derivatives_dev = empty_array_on_device(queue, nvars, ndim, nx+6, ny+6, nz+6)
 
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 0:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
-    prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-    prg = lp.set_options(prg, ignore_boostable_into=True)
-    prg = lp.set_options(prg, no_numpy=True)
-    #prg = lp.set_options(prg, write_wrapper=True)
-    #op_map = lp.get_op_map(prg, count_redundant_work=False)
-    #print(op_map)
+    if write_code:
+        write_to_cl(prg)
 
     allocator = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
 
diff --git a/test.py b/test.py
index b5db8c7..4d2a764 100644
--- a/test.py
+++ b/test.py
@@ -123,19 +123,17 @@ def test_compute_flux_derivatives(ctx_factory):
 
 
 #@pytest.mark.slow
-def test_compute_flux_derivatives_gpu(ctx_factory):
+def test_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
     prg = get_weno_program()
     prg = transform_weno_for_gpu(prg)
 
     queue = get_queue(ctx_factory)
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
-
-    if 1:
-        with open("gen-code.cl", "w") as outf:
-            outf.write(lp.generate_code_v2(prg).device_code())
-
     prg = lp.set_options(prg, no_numpy=True)
 
+    if write_code:
+        write_to_cl(prg)
+
     lp.auto_test_vs_ref(prg, ctx_factory(),
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
diff --git a/utilities.py b/utilities.py
index 1f21f09..d68ab87 100644
--- a/utilities.py
+++ b/utilities.py
@@ -143,3 +143,7 @@ def transform_weno_for_gpu(prg, print_kernel=False):
     return prg
 
 
+def write_to_cl(prg, outfilename="gen-code.cl"):
+    with open(outfilename, "w") as outf:
+        outf.write(lp.generate_code_v2(prg).device_code())
+
-- 
GitLab


From fdbef6e721fd9c7bc264b1d6f96a9d731aa90fd3 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 16:13:15 -0500
Subject: [PATCH 19/21] using one warmup round only for auto_test_vs_ref

---
 test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test.py b/test.py
index 4d2a764..8a59001 100644
--- a/test.py
+++ b/test.py
@@ -118,7 +118,7 @@ def test_compute_flux_derivatives(ctx_factory):
     queue = get_queue(ctx_factory)
     prg = prg.copy(target=lp.PyOpenCLTarget(queue.device))
 
-    lp.auto_test_vs_ref(prg, ctx_factory(),
+    lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1,
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
@@ -134,7 +134,7 @@ def test_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
     if write_code:
         write_to_cl(prg)
 
-    lp.auto_test_vs_ref(prg, ctx_factory(),
+    lp.auto_test_vs_ref(prg, ctx_factory(), warmup_rounds=1,
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
-- 
GitLab


From 28b9a3fe2402426323b732e23cf85d3ab21e19d0 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 17:24:05 -0500
Subject: [PATCH 20/21] refactor out kernel_mult_mat_vec

---
 test.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/test.py b/test.py
index 8a59001..81dff66 100644
--- a/test.py
+++ b/test.py
@@ -89,29 +89,24 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
 
 
 def test_matvec(ctx_factory):
-    def kernel_mult_mat_vec(queue, prg, alpha, a, b):
-        c_dev = empty_array_on_device(queue, *b.shape)
-
-        prg = with_root_kernel(prg, "mult_mat_vec")
-        prg(queue, a=a, b=b, c=c_dev, alpha=alpha)
-
-        return c_dev.get()
-
     def random_array(*shape):
         return np.random.random_sample(shape).astype(np.float32).copy(order="F")
 
-    queue = get_queue(ctx_factory)
     prg = get_weno_program()
+    queue = get_queue(ctx_factory)
 
     a = random_array(10, 10)
     b = random_array(10)
 
-    c = kernel_mult_mat_vec(queue, prg, alpha=1.0, a=a, b=b)
+    c_dev = empty_array_on_device(queue, *b.shape)
+
+    prg = with_root_kernel(prg, "mult_mat_vec")
+    prg(queue, alpha=1.0, a=a, b=b, c=c_dev)
 
-    compare_arrays(a@b, c)
+    compare_arrays(a@b, c_dev.get())
 
 
-#@pytest.mark.slow
+@pytest.mark.slow
 def test_compute_flux_derivatives(ctx_factory):
     prg = get_weno_program()
 
@@ -122,7 +117,7 @@ def test_compute_flux_derivatives(ctx_factory):
             parameters=dict(ndim=3, nvars=5, nx=16, ny=16, nz=16))
 
 
-#@pytest.mark.slow
+@pytest.mark.slow
 def test_compute_flux_derivatives_gpu(ctx_factory, write_code=False):
     prg = get_weno_program()
     prg = transform_weno_for_gpu(prg)
-- 
GitLab


From 633a80a41da443b04a98ab913b6ce89b9ca41731 Mon Sep 17 00:00:00 2001
From: "Timothy A. Smith" <tasmith4@illinois.edu>
Date: Thu, 20 Jun 2019 17:34:07 -0500
Subject: [PATCH 21/21] bugfix in random_array_on_device

---
 test.py      | 13 +++++--------
 utilities.py |  5 +++--
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/test.py b/test.py
index 81dff66..e972734 100644
--- a/test.py
+++ b/test.py
@@ -89,21 +89,18 @@ def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
 
 
 def test_matvec(ctx_factory):
-    def random_array(*shape):
-        return np.random.random_sample(shape).astype(np.float32).copy(order="F")
-
     prg = get_weno_program()
     queue = get_queue(ctx_factory)
 
-    a = random_array(10, 10)
-    b = random_array(10)
+    a = random_array_on_device(queue, 10, 10)
+    b = random_array_on_device(queue, 10)
 
-    c_dev = empty_array_on_device(queue, *b.shape)
+    c = empty_array_on_device(queue, 10)
 
     prg = with_root_kernel(prg, "mult_mat_vec")
-    prg(queue, alpha=1.0, a=a, b=b, c=c_dev)
+    prg(queue, alpha=1.0, a=a, b=b, c=c)
 
-    compare_arrays(a@b, c_dev.get())
+    compare_arrays(a.get()@b.get(), c.get())
 
 
 @pytest.mark.slow
diff --git a/utilities.py b/utilities.py
index d68ab87..306c28e 100644
--- a/utilities.py
+++ b/utilities.py
@@ -15,8 +15,9 @@ def compare_arrays(a, b):
 
 
 def random_array_on_device(queue, *shape):
-    empty = empty_array_on_device(queue, *shape)
-    return cl.clrandom.fill_rand(empty)
+    ary = empty_array_on_device(queue, *shape)
+    cl.clrandom.fill_rand(ary)
+    return ary
 
 
 def empty_array_on_device(queue, *shape):
-- 
GitLab