import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array  # noqa
import pyopencl.tools  # noqa
import pyopencl.clrandom  # noqa
import loopy as lp  # noqa

import sys
import logging

import pytest
from pytest import approx
import pyopencl as cl
from pyopencl.tools import (  # noqa
        pytest_generate_tests_for_pyopencl
        as pytest_generate_tests)

import device_fixtures as device
import program_fixtures as program
import setup_fixtures as setup
import kernel_fixtures as kernel


def compare_arrays(a, b):
    """Assert that *a* equals *b* within pytest's default ``approx`` tolerance."""
    expected = approx(b)
    assert a == expected


def compare_roe_identity(states, R, Rinv):
    """Check that ``R @ (Rinv @ jump)`` reproduces the state jump.

    The jump is column 1 of *states* minus column 0; the comparison is
    delegated to :func:`compare_arrays`.
    """
    state_jump = states[:, 1] - states[:, 0]
    round_trip = R @ (Rinv @ state_jump)
    compare_arrays(round_trip, state_jump)


def compare_roe_property(states, fluxes, R, Rinv, lam):
    """Check that ``R @ (lam * (Rinv @ state_jump))`` matches the flux jump.

    Both jumps are column 1 minus column 0 of *states* and *fluxes*
    respectively; *lam* is applied element-wise via ``np.multiply``.
    """
    state_jump = states[:, 1] - states[:, 0]
    flux_jump = fluxes[:, 1] - fluxes[:, 0]

    # Project the state jump with Rinv, scale by lam, then map back with R.
    scaled = np.multiply(lam, Rinv @ state_jump)
    compare_arrays(R @ scaled, flux_jump)


def transform_compute_flux_derivative_basic(prg):
    """Return *prg* with a minimally transformed ``compute_flux_derivatives``.

    The kernel is given the assumption that ``nx``, ``ny`` and ``nz`` are
    positive, and three of its temporaries are assigned to
    ``lp.AddressSpace.GLOBAL``.
    """
    knl = lp.assume(
            prg["compute_flux_derivatives"],
            "nx > 0 and ny > 0 and nz > 0")

    global_temporaries = (
            "flux_derivatives_generalized",
            "generalized_fluxes",
            "weno_flux_tmp",
            )
    for tmp_name in global_temporaries:
        knl = lp.set_temporary_scope(knl, tmp_name, lp.AddressSpace.GLOBAL)

    return prg.with_kernel(knl)


def transform_weno_for_gpu(prg):
    """Return *prg* with ``compute_flux_derivatives`` transformed for GPU use.

    Builds on :func:`transform_compute_flux_derivative_basic`, then:

    * splits every ``i``/``j`` iname variant (suffixes ``""`` and ``_1``
      through ``_7``) by 16, tagging the outer/inner parts ``g.0``/``l.0``
      for ``i`` and ``g.1``/``l.1`` for ``j``;
    * turns the ``delta_xi``, ``delta_eta`` and ``delta_zeta`` assignments
      into substitution rules;
    * adds a barrier between each pair of consecutive tagged stages.
    """
    prg = transform_compute_flux_derivative_basic(prg)
    knl = prg["compute_flux_derivatives"]

    iname_suffixes = ("", "_1", "_2", "_3", "_4", "_5", "_6", "_7")
    # (iname base, outer tag, inner tag); "i" is split before "j" per suffix.
    split_specs = (("i", "g.0", "l.0"), ("j", "g.1", "l.1"))
    for suffix in iname_suffixes:
        for base, outer, inner in split_specs:
            knl = lp.split_iname(knl, base+suffix, 16,
                    outer_tag=outer, inner_tag=inner)

    for metric_name in ("delta_xi", "delta_eta", "delta_zeta"):
        knl = lp.assignment_to_subst(knl, metric_name)

    # Stages in the order they must run; one barrier goes between each
    # neighbouring pair.
    stage_tags = (
            "tag:to_generalized",
            "tag:flux_x_compute",
            "tag:flux_x_diff",
            "tag:flux_y_compute",
            "tag:flux_y_diff",
            "tag:flux_z_compute",
            "tag:flux_z_diff",
            "tag:from_generalized",
            )
    for earlier, later in zip(stage_tags, stage_tags[1:]):
        knl = lp.add_barrier(knl, earlier, later)

    prg = prg.with_kernel(knl)

    # FIXME: These should work, but don't
    # FIXME: Undo the hand-inlining in WENO.F90
    #prg = lp.inline_callable_kernel(prg, "convert_to_generalized")
    #prg = lp.inline_callable_kernel(prg, "convert_from_generalized")

    # Debugging toggle: set to 1 to print the kernel and stop immediately.
    if 0:
        print(prg["convert_to_generalized_frozen"])
        1/0

    return prg


def transform_compute_flux_derivative_gpu(queue, prg):
    """Return the GPU-transformed program targeted at ``queue.device``.

    Applies :func:`transform_weno_for_gpu`, retargets the program with
    ``lp.PyOpenCLTarget``, writes the generated device code to
    ``gen-code.cl`` in the current directory, and finally sets the
    ``no_numpy`` option.
    """
    gpu_prg = transform_weno_for_gpu(prg).copy(
            target=lp.PyOpenCLTarget(queue.device))

    # Debugging aid: set to False to skip dumping the generated device code.
    dump_device_code = True
    if dump_device_code:
        with open("gen-code.cl", "w") as outf:
            generated = lp.generate_code_v2(gpu_prg)
            outf.write(generated.device_code())

    return lp.set_options(gpu_prg, no_numpy=True)


@pytest.mark.xfail
@pytest.mark.parametrize("states_str,fluxes_str,direction", [
    ("2 1,4 1,4 1,4 1,20 5.5", "4 1,11.2 2.6,8 1,8 1,46.4 7.1", "x"),
    ("2 1,4 1,4 1,4 1,20 5.5", "4 1,8 1,11.2 2.6,8 1,46.4 7.1", "y"),
    ("2 1,4 1,4 1,4 1,20 5.5", "4 1,8 1,8 1,11.2 2.6,46.4 7.1", "z"),
    ("1 2,-1 -4,-1 -4,-1 -4,5.5 20", "-1 -4,2.6 11.2,1 8,1 8,-7.1 -46.4", "x"),
    ("1 2,-1 -4,-1 -4,-1 -4,5.5 20", "-1 -4,1 8,2.6 11.2,1 8,-7.1 -46.4", "y"),
    ("1 2,-1 -4,-1 -4,-1 -4,5.5 20", "-1 -4,1 8,1 8,2.6 11.2,-7.1 -46.4", "z"),
    ("2 1,4 1,8 2,12 3,64 11", "4 1,11.2 2.6,16 2,24 3,134.4 12.6", "x"),
    ("2 1,4 1,8 2,12 3,64 11", "8 2,16 2,35.2 5.6,48 6,268.8 25.2", "y"),
    ("2 1,4 1,8 2,12 3,64 11", "12 3,24 3,48 6,75.2 10.6,403.2 37.8", "z")
    ])
def test_roe_uniform_grid(ctx_factory, states_str, fluxes_str, direction):
    """Check the Roe eigensystem from the WENO program with identity metrics.

    For each parametrized state/flux pair and direction, the eigensystem
    returned by ``kernel.roe_eigensystem`` is checked with
    :func:`compare_roe_identity` and :func:`compare_roe_property`.
    Currently marked ``xfail``.
    """
    queue = device.get_queue(ctx_factory)
    prg = program.get_weno()

    params = setup.roe_params(nvars=5, ndim=3, direction=direction)
    states = setup.array_from_string(states_str)
    metrics_frozen = setup.identity(params.ndim)

    eigensystem = kernel.roe_eigensystem(
            queue, prg, params, states, metrics_frozen)
    R, Rinv, lam = eigensystem

    # First the round-trip identity, then the flux-jump property.
    compare_roe_identity(states, R, Rinv)

    fluxes = setup.array_from_string(fluxes_str)
    compare_roe_property(states, fluxes, R, Rinv, lam)


def test_matvec(ctx_factory):
    """Compare ``kernel.mult_mat_vec`` (with ``alpha=1.0``) against ``a @ b``.

    Inputs come from ``setup.random_array(10, 10)`` and
    ``setup.random_array(10)``.
    """
    queue = device.get_queue(ctx_factory)
    prg = program.get_weno()

    matrix = setup.random_array(10, 10)
    vector = setup.random_array(10)

    product = kernel.mult_mat_vec(queue, prg, alpha=1.0, a=matrix, b=vector)

    compare_arrays(matrix@vector, product)


#@pytest.mark.slow
def test_compute_flux_derivatives(ctx_factory):
    """Smoke-test ``compute_flux_derivatives`` with the basic transform.

    The kernel is run on random input arrays; no result is checked, so the
    test passes as long as the call completes without raising.
    """
    queue = device.get_queue(ctx_factory)
    weno_prg = transform_compute_flux_derivative_basic(program.get_weno())

    params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
    input_arrays = setup.random_flux_derivative_arrays(params)

    kernel.compute_flux_derivatives(queue, weno_prg, params, input_arrays)


#@pytest.mark.slow
def test_compute_flux_derivatives_gpu(ctx_factory):
    """Smoke-test ``compute_flux_derivatives`` with the GPU transform.

    Input arrays come from
    ``setup.random_flux_derivative_arrays_on_device``; no result is
    checked, so the test passes as long as the call completes without
    raising.
    """
    queue = device.get_queue(ctx_factory)
    gpu_prg = transform_compute_flux_derivative_gpu(
            queue, program.get_weno())

    params = setup.flux_derivative_params(ndim=3, nvars=5, n=10)
    device_arrays = setup.random_flux_derivative_arrays_on_device(
            ctx_factory, params)

    kernel.compute_flux_derivatives(queue, gpu_prg, params, device_arrays)


# Script entry point. With no arguments the whole file is handed to pytest;
# otherwise the first argument is executed as Python source, e.g.
# 'python test.py test_case(cl._csc)', with INFO-level logging enabled.
if __name__ == "__main__":
    if len(sys.argv) <= 1:
        pytest.main([__file__])
    else:
        logging.basicConfig(level="INFO")
        # NOTE: runs arbitrary code from the command line; developer use only.
        exec(sys.argv[1])