test_loopy.py

    i1 = lp.CInstruction("i",
            "doSomethingToGetPhi();",
            assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    i2 = lp.Assignment("a",
            lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
            within_inames=frozenset(),
            within_inames_is_final=True)

    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
            [i1, i2],
            [
                lp.GlobalArg("a", dtype=np.float32, shape=()),
                lp.ValueArg("n", dtype=np.int32),
                lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
                ],
            target=lp.CTarget(),
            )

    k = lp.preprocess_kernel(k)

    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))


def test_kernel_splitting(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            """
            for i
                c[i] = a[i + 1]
                ... gbarrier
                out[i] = c[i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))


def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            for i, k
                ... gbarrier
                c[k,i] = a[k, i + 1]
                ... gbarrier
                out[k,i] = c[k,i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))


def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            for i, k
                ... gbarrier
                <> t_private_scalar = a[k,i+1]
                <> t_private_array[i % 2] = a[k,i+1]
                c[k,i] = a[k,i+1]
                ... gbarrier
                out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
    knl = lp.set_temporary_scope(knl, "t_private_array", "private")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))


def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            for i, k
                ... gbarrier
                <> t_local[i % 8,k] = i % 8
                c[k,i] = a[k,i+1]
                ... gbarrier
                out[k,i] = c[k,i] + t_local[i % 8,k]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    knl = lp.set_temporary_scope(knl, "t_local", "local")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8))


def test_global_temporary(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            """
            for i
                <> c[i] = a[i + 1]
                ... gbarrier
                out[i] = c[i]
            end
            """, seq_dependencies=True)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))


def test_assign_to_linear_subscript(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl1 = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            "a[i,i] = 1")
    knl2 = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            "a[[i*n + i]] = 1",
            [lp.GlobalArg("a", shape="n,n"), "..."])

    a1 = cl.array.zeros(queue, (10, 10), np.float32)
    knl1(queue, a=a1)
    a2 = cl.array.zeros(queue, (10, 10), np.float32)
    knl2(queue, a=a2)

    assert np.array_equal(a1.get(),  a2.get())


def test_finite_difference_expr_subst(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
            ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[
                ("f", 1, 0)
                ])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)


# {{{ call without returned values

def test_call_with_no_returned_value(ctx_factory):
    import pymbolic.primitives as p

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{:}",
        [lp.CallInstruction((), p.Call(p.Variable("f"), ()))]
        )

    from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen
    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
    knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen])

    evt, _ = knl(queue)

# }}}


def test_unschedulable_kernel_detection():
    knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                         """
                         mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
                         mat2[j] = mat2[j] + 1 {inames=j, id=i2}
                         mat3[i] = mat3[i] + 1 {inames=i, id=i3}
                         """)

    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 4

    for inames, insns in lp.get_iname_duplication_options(knl):
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert lp.has_schedulable_iname_nesting(fixed_knl)

    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
                         mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
                         mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
                         mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
                         """)

    assert not lp.has_schedulable_iname_nesting(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10


def test_regression_no_ret_call_removal(ctx_factory):
    # https://github.com/inducer/loopy/issues/32
    knl = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "f(sum(i, x[i]))")
    knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
    knl = lp.preprocess_kernel(knl)
    assert len(knl.instructions) == 3


def test_regression_persistent_hash():
    knl1 = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "cse_exprvar = d[2]*d[2]")

    knl2 = lp.make_kernel(
            "{[i] : 0<=i<n}",
            "cse_exprvar = d[0]*d[0]")
    from loopy.tools import LoopyKeyBuilder
    lkb = LoopyKeyBuilder()
    assert lkb(knl1.instructions[0]) != lkb(knl2.instructions[0])
    assert lkb(knl1) != lkb(knl2)


def test_sequential_dependencies(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            for i
                <> aa = 5jf
                <> bb = 5j
                a[i] = imag(aa)
                b[i] = imag(bb)
                c[i] = 5f
            end
            """, seq_dependencies=True)

    print(knl.stringify(with_dependencies=True))

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))


def test_nop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
            """
            for itrip,i
                <> z[i] = z[i+1] + z[i]  {id=wr_z}
                <> v[i] = 11  {id=wr_v}
                ... nop {dep=wr_z:wr_v,id=yoink}
                z[i] = z[i] - z[i+1] + v[i]
            end
            """)

    print(knl)

    knl = lp.fix_parameters(knl, n=15)
    knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))


def test_global_barrier(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
            """
            for i
                for itrip
                    ... gbarrier {id=top}
                    <> z[i] = z[i+1] + z[i]  {id=wr_z,dep=top}
                    <> v[i] = 11  {id=wr_v,dep=top}
                    ... gbarrier {dep=wr_z:wr_v,id=yoink}
                    z[i] = z[i] - z[i+1] + v[i] {id=iupd}
                end
                ... gbarrier {dep=iupd,id=postloop}
                z[i] = z[i] - z[i+1] + v[i]  {dep=postloop}
            end
            """)

    knl = lp.fix_parameters(knl, ntrips=3)
    knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})

    ref_knl = knl
    ref_knl = lp.set_temporary_scope(ref_knl, "z", "global")
    ref_knl = lp.set_temporary_scope(ref_knl, "v", "global")

    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
    print(knl)

    knl = lp.preprocess_kernel(knl)
    assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL
    assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL

    print(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(ntrips=5, n=10))


def test_missing_global_barrier():
    knl = lp.make_kernel(
            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
            """
            for i
                for itrip
                    ... gbarrier {id=yoink}
                    <> z[i] = z[i] - z[i+1]  {id=iupd,dep=yoink}
                end
                # This is where the barrier should be
                z[i] = z[i] - z[i+1] + v[i]  {dep=iupd}
            end
            """)

    knl = lp.set_temporary_scope(knl, "z", "global")
    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0")
    knl = lp.preprocess_kernel(knl)

    from loopy.diagnostic import MissingBarrierError
    with pytest.raises(MissingBarrierError):
        lp.get_one_scheduled_kernel(knl)


def test_index_cse(ctx_factory):
    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         for i
                            for j
                                c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
                            end
                         end
                         """)
    knl = lp.tag_inames(knl, "l:unr")
    knl = lp.prioritize_loops(knl, "i,j,k,l")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
    knl = lp.fix_parameters(knl, n=5)
    print(lp.generate_code_v2(knl).device_code())


def test_ilp_and_conditionals(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond = T[k] < 0.5
             if Tcond
                 cp[k] = 2 * T[k] + Tcond
             end
         end
         """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, 'k', 2, inner_tag='ilp')

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_unr_and_conditionals(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond[k] = T[k] < 0.5
             if Tcond[k]
                 cp[k] = 2 * T[k] + Tcond[k]
             end
         end
         """)

    knl = lp.fix_parameters(knl, n=200)
    knl = lp.add_and_infer_dtypes(knl, {"T": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, 'k', 2, inner_tag='unr')

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_constant_array_args(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             <> Tcond[k] = T[k] < 0.5
             if Tcond[k]
                 cp[k] = 2 * T[k] + Tcond[k]
             end
         end
         """,
         [lp.ConstantArg('T', shape=(200,), dtype=np.float32),
         '...'])

    knl = lp.fix_parameters(knl, n=200)

    lp.auto_test_vs_ref(knl, ctx, knl)


@pytest.mark.parametrize("src_order", ["C"])
@pytest.mark.parametrize("tmp_order", ["C", "F"])
def test_temp_initializer(ctx_factory, src_order, tmp_order):
    a = np.random.randn(3, 3).copy(order=src_order)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            "out[i,j] = tmp[i,j]",
            [
                lp.TemporaryVariable("tmp",
                    initializer=a,
                    shape=lp.auto,
                    scope=lp.temp_var_scope.PRIVATE,
                    order=tmp_order),
                "..."
                ])

    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
    knl = lp.fix_parameters(knl, n=a.shape[0])

    evt, (a2,) = knl(queue, out_host=True)

    assert np.array_equal(a, a2)


def test_header_extract():
    knl = lp.make_kernel('{[k]: 0<=k<n}}',
         """
         for k
             T[k] = k**2
         end
         """,
         [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
         '...'])

    knl = lp.fix_parameters(knl, n=200)

    #test C
    cknl = knl.copy(target=lp.CTarget())
    assert str(lp.generate_header(cknl)[0]) == (
            'void loopy_kernel(float *__restrict__ T);')

    #test CUDA
    cuknl = knl.copy(target=lp.CudaTarget())
    assert str(lp.generate_header(cuknl)[0]) == (
            'extern "C" __global__ void __launch_bounds__(1) '
            'loopy_kernel(float *__restrict__ T);')

    #test OpenCL
    oclknl = knl.copy(target=lp.PyOpenCLTarget())
    assert str(lp.generate_header(oclknl)[0]) == (
            '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
            'loopy_kernel(__global float *__restrict__ T);')


def test_scalars_with_base_storage(ctx_factory):
    """ Regression test for !50 """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{ [i]: 0<=i<1}",
            "a = 1",
            [lp.TemporaryVariable("a", dtype=np.float64,
                                  shape=(), base_storage="base")])

    knl(queue, out_host=True)


def test_if_else(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{ [i]: 0<=i<50}",
            """
            if i % 3 == 0
                a[i] = 15
            elif i % 3 == 1
                a[i] = 11
            else
                a[i] = 3
            end
            """
            )

    evt, (out,) = knl(queue, out_host=True)

    out_ref = np.empty(50)
    out_ref[::3] = 15
    out_ref[1::3] = 11
    out_ref[2::3] = 3

    assert np.array_equal(out_ref, out)


def test_tight_loop_bounds(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ["{ [i] : 0 <= i <= 5 }",
         "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"],
        """
        for i
          for j
            out[j] = j
          end
        end
        """,
        silenced_warnings="write_race(insn)")

    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")

    evt, (out,) = knl(queue, out_host=True)

    assert (out == np.arange(10)).all()


def test_tight_loop_bounds_codegen():
    knl = lp.make_kernel(
        ["{ [i] : 0 <= i <= 5 }",
         "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
        """
        for i
          for j
            out[j] = j
          end
        end
        """,
        silenced_warnings="write_race(insn)",
        target=lp.OpenCLTarget())

    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")

    cgr = lp.generate_code_v2(knl)
    #print(cgr.device_code())

    for_loop = \
        "for (int j = " \
        "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
        "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)"

    assert for_loop in cgr.device_code()


if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
        from py.test.cmdline import main
        main([__file__])

# vim: foldmethod=marker