test_loopy.py

                a = 1.5 + sum((i,j), i*j)
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """,
            [
                lp.GlobalArg("a", shape=lp.auto),
                lp.GlobalArg("b", shape=lp.auto),
                lp.GlobalArg("c", shape=lp.auto),
                lp.ValueArg("n"),
                ],
            assumptions="n>=1")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()


def test_arg_guessing(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n }",
            ],
            """
                a = 1.5 + sum((i,j), i*j)
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """,
            assumptions="n>=1")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()


def test_arg_guessing_with_reduction(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n }",
            ],
            """
                a = 1.5 + sum((i,j), i*j)
                d = 1.5 + sum((i,j), b[i,j])
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """,
            assumptions="n>=1")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()

# }}}


def test_nonlinear_index(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n }",
            ],
            """
                a[i*i] = 17
                """,
            [
                lp.GlobalArg("a", shape="n"),
                lp.ValueArg("n"),
                ],
            assumptions="n>=1")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()


def test_triangle_domain(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n and i <= j}",
            ],
            "a[i,j] = 17",
            assumptions="n>=1")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()


def test_offsets_and_slicing(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i<n and 0<=j<m }",
            ],
            """
                b[i,j] = 2*a[i,j]
                """,
            assumptions="n>=1 and m>=1",
            default_offset=lp.auto)

    knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1")

    cknl = lp.CompiledKernel(ctx, knl)

    a_full = cl.clrandom.rand(queue, (n, n), np.float64)
    a_full_h = a_full.get()
    b_full = cl.clrandom.rand(queue, (n, n), np.float64)
    b_full_h = b_full.get()

    a_sub = (slice(3, 10), slice(5, 10))
    a = a_full[a_sub]

    b_sub = (slice(3+3, 10+3), slice(5+4, 10+4))
    b = b_full[b_sub]

    b_full_h[b_sub] = 2*a_full_h[a_sub]

    print cknl.get_highlighted_code({"a": a.dtype})
    cknl(queue, a=a, b=b)

    import numpy.linalg as la
    assert la.norm(b_full.get() - b_full_h) < 1e-13


def test_vector_ilp_with_prefetch(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0],
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            [
                # Tests that comma'd arguments interoperate with
                # argument guessing.
                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                "..."
                ])

    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    cknl = lp.CompiledKernel(ctx, knl)
    cknl.cl_kernel_info()

    import re
    code = cknl.get_code()
    assert len(list(re.finditer("barrier", code))) == 1


def test_convolution(ctx_factory):
    ctx = ctx_factory()

    dtype = np.float64

    knl = lp.make_kernel(ctx.devices[0],
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
            img[iimg, im_x-f_x, im_y-f_y, icolor] \
            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1",
        defines=dict(ncolors=3))

    ref_knl = knl

    def variant_0(knl):
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        return knl

    for variant in [
            variant_0,
            #variant_1
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=7,
                    nfeats=12, nimgs=17
                    ))


def test_c_instruction(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n }",
            ],
            [
                lp.CInstruction("i", """
                    x = sin((float) i);
                    """, assignees="x"),
                "a[i*i] = x",
                ],
            [
                lp.GlobalArg("a", shape="n"),
                lp.ValueArg("n"),
                lp.TemporaryVariable("x", np.float32),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code()


def test_dependent_domain_insn_iname_finding(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
            "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
            ],
            """
                <> src_ibox = source_boxes[isrc_box]
                <> isrc_start = box_source_starts[src_ibox]
                <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
                <> strength = strengths[isrc] {id=set_strength}
                """,
            [
                lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("strengths",
                    None, shape="nsources"),
                "..."])

    print knl
    assert "isrc_box" in knl.insn_inames("set_strength")

    print lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                source_boxes=np.int32,
                box_source_starts=np.int32,
                box_source_counts_nonchild=np.int32,
                strengths=np.float64,
                nsources=np.int32,
                ))


def test_inames_deps_from_write_subscript(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i,j<n}",
            ],
            """
                <> src_ibox = source_boxes[i]
                <int32> something = 5
                a[src_ibox] = sum(j, something) {id=myred}
                """,
            [
                lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a",
                    None, shape=None),
                "..."])

    print knl
    assert "i" in knl.insn_inames("myred")


def test_split_reduction(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j,k]: 0<=i,j,k<n}",
            ],
            """
                b = sum((i,j,k), a[i,j,k])
                """,
            [
                lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a",
                    None, shape=None),
                "..."])

    knl = lp.split_reduction_outward(knl, "j,k")
    # FIXME: finish test


def test_modulo_indexing(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(ctx.devices[0], [
            "{[i,j]: 0<=i<n and 0<=j<5}",
            ],
            """
                b[i] = sum(j, a[(i+j)%n])
                """,
            [
                lp.GlobalArg("a", None, shape="n"),
                "..."
                ]
            )

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                a=np.float32,
                ))


def test_rob_stroud_bernstein(ctx_factory):
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(ctx.devices[0],
            "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
            """
                <> xi = qpts[el, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init,inames=i2:el}

                <> w = s**(deg-alpha1) {id=init_w}

                tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                        {id=write_tmp,inames=alpha2}
                w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                        {id=update_w,dep=init_w:write_tmp}
                aind = aind + 1 \
                        {id=aind_incr,\
                        dep=aind_init:write_tmp:update_w, \
                        inames=el:i2:alpha1:alpha2}
                """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure it out the shape automatically.

                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1"
            )

    knl = lp.fix_parameter(knl, "nqp1d", 7)
    knl = lp.fix_parameter(knl, "deg", 4)
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
            slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    print knl
    print lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                ))


@pytest.mark.parametrize("vec_len", [2, 3, 4, 8])
def test_vector_types(ctx_factory, vec_len):
    ctx = cl.create_some_context()

    knl = lp.make_kernel(ctx.devices[0],
            "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
            "out[i,j] = 2*a[i,j]",
            [
                lp.GlobalArg("a", np.float32, shape=lp.auto),
                lp.GlobalArg("out", np.float32, shape=lp.auto),
                "..."
                ])

    knl = lp.fix_parameter(knl, "vec_len", vec_len)

    ref_knl = knl

    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, dict(j="unr"))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(
                n=20000
                ),
            fills_entire_output=False)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
        from py.test.cmdline import main
        main([__file__])

# vim: foldmethod=marker