diff --git a/test/test_apps.py b/test/test_apps.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3d743c4bdf49c6424e962a08bab0c396f8add7
--- /dev/null
+++ b/test/test_apps.py
@@ -0,0 +1,401 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+# {{{ convolutions
+
+def test_convolution(ctx_factory):
+    ctx = ctx_factory()
+
+    dtype = np.float32
+
+    knl = lp.make_kernel(
+        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
+                -f_w <= f_x,f_y <= f_w \
+                and 0 <= im_x < im_w and 0 <= im_y < im_h \
+                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
+                }",
+        """
+        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
+                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
+                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
+        """,
+        [
+            lp.GlobalArg("f", dtype, shape=lp.auto),
+            lp.GlobalArg("img", dtype, shape=lp.auto),
+            lp.GlobalArg("out", dtype, shape=lp.auto),
+            "..."
+            ],
+        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
+        flags="annotate_inames",
+        defines=dict(ncolors=3))
+
+    f_w = 3
+
+    knl = lp.fix_parameters(knl, f_w=f_w)
+
+    ref_knl = knl
+
+    def variant_0(knl):
+        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_2(knl):
+        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
+        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
+        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
+        return knl
+
+    for variant in [
+            variant_0,
+            variant_1,
+            variant_2
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                parameters=dict(
+                    im_w=128, im_h=128, f_w=f_w,
+                    nfeats=3, nimgs=3
+                    ))
+
+
+def test_convolution_with_nonzero_base(ctx_factory):
+    # This is kept alive as a test for domains that don't start at zero.
+    # These are a bad idea for split_iname, which places its origin at zero
+    # and therefore produces a first block that is odd-sized.
+    #
+    # Therefore, for real tests, check test_convolution further up.
+
+    ctx = ctx_factory()
+
+    dtype = np.float32
+
+    knl = lp.make_kernel(
+        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
+                -f_w <= f_x,f_y <= f_w \
+                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
+                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
+                }",
+        """
+        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
+                img[iimg, im_x-f_x, im_y-f_y, icolor] \
+                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
+        """,
+        [
+            lp.GlobalArg("f", dtype, shape=lp.auto),
+            lp.GlobalArg("img", dtype, shape=lp.auto),
+            lp.GlobalArg("out", dtype, shape=lp.auto),
+            "..."
+            ],
+        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
+        flags="annotate_inames",
+        defines=dict(ncolors=3))
+
+    ref_knl = knl
+
+    f_w = 3
+
+    def variant_0(knl):
+        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
+        return knl
+
+    for variant in [
+            variant_0,
+            variant_1,
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                parameters=dict(
+                    im_w=128, im_h=128, f_w=f_w,
+                    nfeats=12, nimgs=17
+                    ))
+
+# }}}
+
+
+def test_rob_stroud_bernstein(ctx_factory):
+    ctx = ctx_factory()
+
+    # NOTE: tmp would have to be zero-filled beforehand
+
+    knl = lp.make_kernel(
+        "{[el, i2, alpha1,alpha2]: \
+                0 <= el < nels and \
+                0 <= i2 < nqp1d and \
+                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
+        """
+        <> xi = qpts[1, i2] {inames=+el}
+        <> s = 1-xi
+        <> r = xi/s
+        <> aind = 0 {id=aind_init,inames=+i2:el}
+
+        <> w = s**(deg-alpha1) {id=init_w}
+
+        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
+                {id=write_tmp,inames=+alpha2}
+        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
+                {id=update_w,dep=init_w:write_tmp}
+        aind = aind + 1 \
+                {id=aind_incr,\
+                dep=aind_init:write_tmp:update_w, \
+                inames=+el:i2:alpha1:alpha2}
+        """,
+        [
+            # Must declare coeffs to have "no" shape, to keep loopy
+            # from trying to figure out the shape automatically.
+
+            lp.GlobalArg("coeffs", None, shape=None),
+            "..."
+            ],
+        assumptions="deg>=0 and nels>=1"
+        )
+
+    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
+    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
+    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
+            slabs=(0, 1))
+    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
+
+    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
+            dict(
+                qpts=np.float32,
+                coeffs=np.float32,
+                tmp=np.float32,
+                )))
+
+
+def test_rob_stroud_bernstein_full(ctx_factory):
+    #logging.basicConfig(level=logging.DEBUG)
+    ctx = ctx_factory()
+
+    # NOTE: result would have to be zero-filled beforehand
+
+    knl = lp.make_kernel(
+        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
+                0 <= el < nels and \
+                0 <= i2 < nqp1d and \
+                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
+                \
+                0 <= i1_2 < nqp1d and \
+                0 <= alpha1_2 <= deg and \
+                0 <= i2_2 < nqp1d \
+                }",
+        """
+        <> xi = qpts[1, i2] {inames=+el}
+        <> s = 1-xi
+        <> r = xi/s
+        <> aind = 0 {id=aind_init,inames=+i2:el}
+
+        <> w = s**(deg-alpha1) {id=init_w}
+
+        <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
+                {id=write_tmp,inames=+alpha2}
+        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
+                {id=update_w,dep=init_w:write_tmp}
+        aind = aind + 1 \
+                {id=aind_incr,\
+                dep=aind_init:write_tmp:update_w, \
+                inames=+el:i2:alpha1:alpha2}
+
+        <> xi2 = qpts[0, i1_2] {dep=aind_incr,inames=+el}
+        <> s2 = 1-xi2
+        <> r2 = xi2/s2
+        <> w2 = s2**deg
+
+        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
+                w2 * tmp[alpha1_2, i2_2] \
+                {inames=el:alpha1_2:i1_2:i2_2}
+
+        w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
+        """,
+        [
+            # Must declare coeffs to have "no" shape, to keep loopy
+            # from trying to figure out the shape automatically.
+
+            lp.GlobalArg("coeffs", None, shape=None),
+            "..."
+            ],
+        assumptions="deg>=0 and nels>=1"
+        )
+
+    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
+
+    if 0:
+        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
+        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
+                slabs=(0, 1))
+        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
+
+    from pickle import dumps, loads
+    knl = loads(dumps(knl))
+
+    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
+            dict(
+                qpts=np.float32,
+                tmp=np.float32,
+                coeffs=np.float32,
+                result=np.float32,
+                ))
+    print(knl)
+
+
+def test_stencil(ctx_factory):
+    ctx = ctx_factory()
+
+    # n=32 causes corner case behavior in size calculations for temporary (a
+    # non-unifiable, two-constant-segments PwAff as the base index)
+
+    n = 256
+    knl = lp.make_kernel(
+        "{[i,j]: 0<= i,j < %d}" % n,
+        [
+            "a_offset(ii, jj) := a[ii+1, jj+1]",
+            "z[i,j] = -2*a_offset(i,j)"
+            " + a_offset(i,j-1)"
+            " + a_offset(i,j+1)"
+            " + a_offset(i-1,j)"
+            " + a_offset(i+1,j)"
+            ],
+        [
+            lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)),
+            lp.GlobalArg("z", np.float32, shape=(n+2, n+2,))
+            ])
+
+    ref_knl = knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    def variant_2(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
+                fetch_bounding_box=True)
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    for variant in [
+            #variant_1,
+            variant_2,
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                print_ref_code=False,
+                op_count=[n*n], op_label=["cells"])
+
+
+def test_stencil_with_overfetch(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<= i,j < n}",
+        [
+            "a_offset(ii, jj) := a[ii+2, jj+2]",
+            "z[i,j] = -2*a_offset(i,j)"
+            " + a_offset(i,j-1)"
+            " + a_offset(i,j+1)"
+            " + a_offset(i-1,j)"
+            " + a_offset(i+1,j)"
+
+            " + a_offset(i,j-2)"
+            " + a_offset(i,j+2)"
+            " + a_offset(i-2,j)"
+            " + a_offset(i+2,j)"
+            ],
+        assumptions="n>=1")
+
+    if ctx.devices[0].platform.name == "Portable Computing Language":
+        # https://github.com/pocl/pocl/issues/205
+        pytest.skip("takes very long to compile on pocl")
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    def variant_overfetch(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1",
+                slabs=(1, 1))
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0",
+                slabs=(1, 1))
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
+                fetch_bounding_box=True)
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    for variant in [variant_overfetch]:
+        n = 200
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                print_ref_code=False,
+                op_count=[n*n], parameters=dict(n=n), op_label=["cells"])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_domain.py b/test/test_domain.py
new file mode 100644
index 0000000000000000000000000000000000000000..0631a08796eeda73376afcd1cd03a737a33241f6
--- /dev/null
+++ b/test/test_domain.py
@@ -0,0 +1,375 @@
+from __future__ import
division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +from six.moves import range + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_assume(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "a[i] = a[i] + 1", + [lp.GlobalArg("a", np.float32, shape="n"), "..."]) + + knl = lp.split_iname(knl, "i", 16) + knl = lp.set_loop_priority(knl, "i_outer,i_inner") + knl = lp.assume(knl, "n mod 16 = 0") + knl = lp.assume(knl, "n > 10") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + kernel_gen = lp.generate_loop_schedules(knl) + + for gen_knl in kernel_gen: + print(gen_knl) + compiled = lp.CompiledKernel(ctx, gen_knl) + print(compiled.get_code()) + assert "if" not in compiled.get_code() + + +def test_divisibility_assumption(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "[n] -> {[i]: 0<=i<n}", + [ + "b[i] = 2*a[i]" + ], + [ + lp.GlobalArg("a", np.float32, shape=("n",)), + lp.GlobalArg("b", np.float32, shape=("n",)), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and (exists zz: n = 16*zz)") + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + code = lp.generate_code(k) + assert "if" not in code + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters={"n": 16**3}) + + +def test_eq_constraint(ctx_factory): + logging.basicConfig(level=logging.INFO) + + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<= i,j < 32}", + [ + "a[i] = b[i]" + ], + [ + lp.GlobalArg("a", np.float32, shape=(1000,)), + lp.GlobalArg("b", np.float32, shape=(1000,)) + ]) + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + kernel_gen = lp.generate_loop_schedules(knl) + + for knl in kernel_gen: + print(lp.generate_code(knl)) + + +def test_dependent_loop_bounds(ctx_factory): + dtype = 
np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]", + "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])", + ], + [ + lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), + lp.GlobalArg("a_indices", np.int32, shape=lp.auto), + lp.GlobalArg("a_values", dtype), + lp.GlobalArg("a_sum", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and row_len>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + +def test_dependent_loop_bounds_2(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_start = a_rowstarts[i]", + "<> row_len = a_rowstarts[i+1] - row_start", + "ax[i] = sum(jj, a_values[[row_start+jj]])", + ], + [ + lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), + lp.GlobalArg("a_indices", np.int32, shape=lp.auto), + lp.GlobalArg("a_values", dtype, strides=(1,)), + lp.GlobalArg("ax", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and row_len>=1") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", + inner_tag="l.0") + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + +def test_dependent_loop_bounds_3(ctx_factory): + # The point of this test is that it shows a dependency between + # domains that is exclusively mediated by the row_len temporary. + # It also makes sure that row_len gets read before any + # conditionals use it. 
+ + dtype = np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_len = a_row_lengths[i]", + "a[i,jj] = 1", + ], + [ + lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), + lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), + lp.ValueArg("n", np.int32), + ]) + + assert knl.parents_per_domain()[1] == 0 + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", + inner_tag="l.0") + + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", + inner_tag="l.1") + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + import pytest + with pytest.raises(RuntimeError): + list(lp.generate_loop_schedules(knl_bad)) + + +def test_independent_multi_domain(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[j]: 0<=j<n}", + ], + [ + "a[i] = 1", + "b[j] = 2", + ], + [ + lp.GlobalArg("a", dtype, shape=("n"), order="C"), + lp.GlobalArg("b", dtype, shape=("n"), order="C"), + lp.ValueArg("n", np.int32), + ]) + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", + inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", + inner_tag="l.0") + assert knl.parents_per_domain() == 2*[None] + + n = 50 + cknl = lp.CompiledKernel(ctx, knl) + evt, (a, b) = cknl(queue, n=n, out_host=True) + + assert a.shape == (50,) + assert b.shape == (50,) + assert (a == 1).all() + assert (b == 2).all() + + +def test_equality_constraints(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "[n] -> {[i,j]: 0<=i,j<n }", + "{[k]: k =i+5 and k < n}", + ], + [ + "a[i,j] = 5 {id=set_all}", + "b[i,k] = 22 {dep=set_all}", + ], + [ + lp.GlobalArg("a,b", dtype, shape="n, n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + name="equality_constraints", assumptions="n>=1") + + seq_knl = knl + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + #print(knl) + #print(knl.domains[0].detect_equalities()) + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n), print_ref_code=True) + + +def test_stride(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "{[i]: 0<=i<n and (exists l: i = 2*l)}", + ], + [ + "a[i] = 5", + ], + [ + lp.GlobalArg("a", dtype, shape="n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + assumptions="n>=1") + + seq_knl = knl + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n)) + + +def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "{[i]: 0<=i<n }", + "{[k]: k=i and (exists l: k = 2*l) }", + ], + [ + "a[i] = 5 {id=set}", + "b[k] = 6 {dep=set}", + ], + [ + lp.GlobalArg("a,b", dtype, shape="n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + assumptions="n>=1") + + seq_knl = knl + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n)) + + +def test_triangle_domain(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n and i <= j}", + "a[i,j] = 17", + assumptions="n>=1") + + 
print(knl) + print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index f0718fbab68450227c377afcaf383cab5db5cacb..3cf953ce41caa75b856c0c54fe23ed9e10afcd2a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -79,23 +79,6 @@ def test_complicated_subst(ctx_factory): assert substs_with_letter == how_many -def test_extract_subst(ctx_factory): - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - a[i] = 23*b[i]**2 + 25*b[i]**2 - """) - - knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha") - - print(knl) - - from loopy.symbolic import parse - - insn, = knl.instructions - assert insn.expression == parse("bsquare(23) + bsquare(25)") - - def test_type_inference_no_artificial_doubles(ctx_factory): ctx = ctx_factory() @@ -142,28 +125,6 @@ def test_sized_and_complex_literals(ctx_factory): lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5)) -def test_assume(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "a[i] = a[i] + 1", - [lp.GlobalArg("a", np.float32, shape="n"), "..."]) - - knl = lp.split_iname(knl, "i", 16) - knl = lp.set_loop_priority(knl, "i_outer,i_inner") - knl = lp.assume(knl, "n mod 16 = 0") - knl = lp.assume(knl, "n > 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() - - def test_simple_side_effect(ctx_factory): ctx = ctx_factory() @@ -184,22 +145,6 @@ def test_simple_side_effect(ctx_factory): print(compiled.get_code()) -def test_nonsense_reduction(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<100}", - """ - a[i] = sum(i, 2) - """, - [lp.GlobalArg("a", np.float32, shape=(100,))] - ) - - import pytest - with pytest.raises(RuntimeError): - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - def test_owed_barriers(ctx_factory): ctx = ctx_factory() @@ -243,56 +188,6 @@ def test_wg_too_small(ctx_factory): lp.CompiledKernel(ctx, gen_knl).get_code() -def test_join_inames(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<16}", - [ - "b[i,j] = 2*a[i,j]" - ], - [ - lp.GlobalArg("a", np.float32, shape=(16, 16,)), - lp.GlobalArg("b", np.float32, shape=(16, 16,)) - ], - ) - - ref_knl = knl - - knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) - knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) - - lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) - - -def test_divisibility_assumption(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "[n] -> {[i]: 0<=i<n}", - [ - "b[i] = 2*a[i]" - ], - [ - lp.GlobalArg("a", np.float32, shape=("n",)), - lp.GlobalArg("b", np.float32, shape=("n",)), - lp.ValueArg("n", np.int32), - ], - assumptions="n>=1 and (exists zz: n = 16*zz)") - - ref_knl = knl - - knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code - - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters={"n": 16**3}) - - def test_multi_cse(ctx_factory): ctx = ctx_factory() @@ -315,149 +210,6 @@ def test_multi_cse(ctx_factory): print(compiled.get_code()) -def test_stencil(ctx_factory): - ctx = ctx_factory() - - 
# n=32 causes corner case behavior in size calculations for temprorary (a - # non-unifiable, two-constant-segments PwAff as the base index) - - n = 256 - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < %d}" % n, - [ - "a_offset(ii, jj) := a[ii+1, jj+1]", - "z[i,j] = -2*a_offset(i,j)" - " + a_offset(i,j-1)" - " + a_offset(i,j+1)" - " + a_offset(i-1,j)" - " + a_offset(i+1,j)" - ], - [ - lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)), - lp.GlobalArg("z", np.float32, shape=(n+2, n+2,)) - ]) - - ref_knl = knl - - def variant_1(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - def variant_2(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - for variant in [ - #variant_1, - variant_2, - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - print_ref_code=False, - op_count=[n*n], op_label=["cells"]) - - -def test_stencil_with_overfetch(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < n}", - [ - "a_offset(ii, jj) := a[ii+2, jj+2]", - "z[i,j] = -2*a_offset(i,j)" - " + a_offset(i,j-1)" - " + a_offset(i,j+1)" - " + a_offset(i-1,j)" - " + a_offset(i+1,j)" - - " + a_offset(i,j-2)" - " + a_offset(i,j+2)" - " + a_offset(i-2,j)" - " + a_offset(i+2,j)" - ], - assumptions="n>=1") - - if ctx.devices[0].platform.name == "Portable Computing Language": - # https://github.com/pocl/pocl/issues/205 - pytest.skip("takes very long to compile on pocl") - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - def variant_overfetch(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1", - slabs=(1, 1)) - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0", - slabs=(1, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - for variant in [variant_overfetch]: - n = 200 - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - print_ref_code=False, - op_count=[n*n], parameters=dict(n=n), op_label=["cells"]) - - -def test_eq_constraint(ctx_factory): - logging.basicConfig(level=logging.INFO) - - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < 32}", - [ - "a[i] = b[i]" - ], - [ - lp.GlobalArg("a", np.float32, shape=(1000,)), - lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") - knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) - - -def test_argmax(ctx_factory): - logging.basicConfig(level=logging.INFO) - - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 10000 - - knl = lp.make_kernel( - "{[i]: 0<=i<%d}" % n, - """ - max_val, max_idx = argmax(i, fabs(a[i])) - """) - - knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) - print(lp.preprocess_kernel(knl)) - knl = lp.set_options(knl, write_cl=True, highlight_cl=True) - - a = 
np.random.randn(10000).astype(dtype) - evt, (max_idx, max_val) = knl(queue, a=a, out_host=True) - assert max_val == np.max(np.abs(a)) - assert max_idx == np.where(np.abs(a) == max_val)[-1] - - # {{{ code generator fuzzing def make_random_value(): @@ -571,656 +323,148 @@ def test_fuzz_code_generator(ctx_factory): # }}} -def test_empty_reduction(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<20}", - "[i] -> {[j]: 0<=j<0}" - ], - "a[i] = sum(j, j)", - ) - - knl = lp.realize_reduction(knl) - print(knl) - - knl = lp.set_options(knl, write_cl=True) - evt, (a,) = knl(queue) - - assert (a.get() == 0).all() - - -def test_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) +def test_bare_data_dependency(ctx_factory): + dtype = np.dtype(np.float32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( [ - "{[i]: 0<=i<n}", - "{[j]: 0<=j<i+sumlen}" + "[znirp] -> {[i]: 0<=i<znirp}", ], [ - "<> sumlen = l[i]", - "a[i] = sum(j, j)", + "<> znirp = n", + "a[i] = 1", ], [ + lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("l", np.int32, ("n",)), ]) cknl = lp.CompiledKernel(ctx, knl) + n = 20000 + evt, (a,) = cknl(queue, n=n, out_host=True) - n = 330 - l = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + assert a.shape == (n,) + assert (a == 1).all() - tgt_result = (2*l-1)*2*l/2 - assert (a == tgt_result).all() +# {{{ test race detection -def test_multi_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) +@pytest.mark.skipif("sys.version_info < (2,6)") +def test_ilp_write_race_detection_global(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i,j<n }", [ - "{[itgt]: 0 <= itgt < ntgts}", - "{[isrc_box]: 0 <= isrc_box < nboxes}", - "{[isrc]: 0 <= isrc < npart}" - ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "a[itgt] = sum((isrc_box, isrc), 1)", + "a[i] = 5+i+j", ], [ - lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), - lp.ValueArg("ntgts", np.int32), - lp.ValueArg("nboxes", np.int32), + lp.GlobalArg("a", np.float32), + lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="ntgts>=1") + assumptions="n>=1") - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) - # FIXME: Actually test functionality. 
+ knl = lp.tag_inames(knl, dict(j="ilp")) + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + with lp.CacheMode(False): + from loopy.diagnostic import WriteRaceConditionWarning + from warnings import catch_warnings + with catch_warnings(record=True) as warn_list: + list(lp.generate_loop_schedules(knl)) + assert any(isinstance(w.message, WriteRaceConditionWarning) + for w in warn_list) -def test_recursive_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) + +def test_ilp_write_race_avoidance_local(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( + "{[i,j]: 0<=i<16 and 0<=j<17 }", [ - "{[itgt]: 0 <= itgt < ntgts}", - "{[isrc_box]: 0 <= isrc_box < nboxes}", - "{[isrc]: 0 <= isrc < npart}" - ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "<> boxsum = sum(isrc, isrc+isrc_box+itgt)", - "a[itgt] = sum(isrc_box, boxsum)", - ], - [ - lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), - lp.ValueArg("ntgts", np.int32), - lp.ValueArg("nboxes", np.int32), + "<> a[i] = 5+i+j", ], - assumptions="ntgts>=1") + []) - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) - # FIXME: Actually test functionality. + knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + assert k.temporary_variables["a"].shape == (16, 17) -@pytest.mark.parametrize("size", [128, 5, 113, 67]) -def test_local_parallel_reduction(ctx_factory, size): +def test_ilp_write_race_avoidance_private(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i, j]: 0 <= i < n and 0 <= j < 5}", - """ - z[j] = sum(i, i+j) - """) - - knl = lp.fix_parameters(knl, n=size) - - ref_knl = knl - - def variant0(knl): - return lp.tag_inames(knl, "i:l.0") - - def variant1(knl): - return lp.tag_inames(knl, "i:l.0,j:l.1") + "{[j]: 0<=j<16 }", + [ + "<> a = 5+j", + ], + []) - def variant2(knl): - return lp.tag_inames(knl, "i:l.0,j:g.0") + knl = lp.tag_inames(knl, dict(j="ilp")) - for variant in [ - variant0, - variant1, - variant2 - ]: - knl = variant(ref_knl) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + assert k.temporary_variables["a"].shape == (16,) - lp.auto_test_vs_ref(ref_knl, ctx, knl) +# }}} -# FIXME: Make me a test -@pytest.mark.parametrize("size", [10000]) -def no_test_global_parallel_reduction(ctx_factory, size): +def test_write_parameter(ctx_factory): + dtype = np.float32 ctx = ctx_factory() - queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{[i]: 0 <= i < n }", + "{[i,j]: 0<=i,j<n }", """ - <> key = make_uint2(i, 324830944) {inames=i} - <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} - <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} - z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) - """) - - # ref_knl = knl - - gsize = 128 - knl = lp.split_iname(knl, "i", gsize * 20) - knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") - knl = lp.split_reduction_inward(knl, "i_inner_inner") - knl = lp.split_reduction_inward(knl, "i_inner_outer") - from loopy.transform.data import reduction_arg_to_subst_rule - knl = reduction_arg_to_subst_rule(knl, "i_outer") - knl = lp.precompute(knl, "red_i_outer_arg", "i_outer") - print(knl) - 1/0 - knl = lp.realize_reduction(knl) + a = sum((i,j), i*j) + b = sum(i, sum(j, i*j)) + n = 15 + """, + [ + lp.GlobalArg("a", dtype, shape=()), + lp.GlobalArg("b", dtype, shape=()), + lp.ValueArg("n", np.int32, approximately=1000), + ], + 
assumptions="n>=1") - evt, (z,) = knl(queue, n=size) + import pytest + with pytest.raises(RuntimeError): + lp.CompiledKernel(ctx, knl).get_code() - #lp.auto_test_vs_ref(ref_knl, ctx, knl) +# {{{ arg guessing -@pytest.mark.parametrize("size", [10000]) -def test_global_parallel_reduction_simpler(ctx_factory, size): +def test_arg_shape_guessing(ctx_factory): ctx = ctx_factory() - queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}", + "{[i,j]: 0<=i,j<n }", """ - <> key = make_uint2(l+nl*g, 1234) {inames=l:g} - <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr} - <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + a = 1.5 + sum((i,j), i*j) + b[i, j] = i*j + c[i+j, j] = b[j,i] + """, + [ + lp.GlobalArg("a", shape=lp.auto), + lp.GlobalArg("b", shape=lp.auto), + lp.GlobalArg("c", shape=lp.auto), + lp.ValueArg("n"), + ], + assumptions="n>=1") - <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3) + print(knl) + print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - result = sum(j, tmp[j]) - """) - ng = 50 - knl = lp.fix_parameters(knl, ng=ng) - - knl = lp.set_options(knl, write_cl=True) - - ref_knl = knl - - knl = lp.split_iname(knl, "l", 128, inner_tag="l.0") - knl = lp.split_reduction_outward(knl, "l_inner") - knl = lp.tag_inames(knl, "g:g.0,j:l.0") - - evt, (result,) = knl(queue, nl=size) - evt, (result_ref,) = ref_knl(queue, nl=size) - - nsamples = size * 2 * ng - print(result.get()/nsamples, result_ref.get()/nsamples) - assert abs(result.get() - result_ref.get()) / abs(result_ref.get()) < 1e-5 - - # FIXME: auto_test breaks - #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size}) - - -def test_dependent_loop_bounds(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]", - "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])", - ], - [ - lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), - lp.GlobalArg("a_indices", np.int32, shape=lp.auto), - lp.GlobalArg("a_values", dtype), - lp.GlobalArg("a_sum", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], - assumptions="n>=1 and row_len>=1") - - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") - - -def test_dependent_loop_bounds_2(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_start = a_rowstarts[i]", - "<> row_len = a_rowstarts[i+1] - row_start", - "ax[i] = sum(jj, a_values[[row_start+jj]])", - ], - [ - lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), - lp.GlobalArg("a_indices", np.int32, shape=lp.auto), - lp.GlobalArg("a_values", dtype, strides=(1,)), - lp.GlobalArg("ax", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], - assumptions="n>=1 and row_len>=1") - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", - inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") - - -def test_dependent_loop_bounds_3(ctx_factory): - # The point of this test is that it shows a dependency between - # domains that is exclusively mediated by the row_len temporary. 
- # It also makes sure that row_len gets read before any - # conditionals use it. - - dtype = np.dtype(np.float32) - ctx = ctx_factory() - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_len = a_row_lengths[i]", - "a[i,jj] = 1", - ], - [ - lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), - lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - assert knl.parents_per_domain()[1] == 0 - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", - inner_tag="l.0") - - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") - - knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", - inner_tag="l.1") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - import pytest - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) - - -def test_independent_multi_domain(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[j]: 0<=j<n}", - ], - [ - "a[i] = 1", - "b[j] = 2", - ], - [ - lp.GlobalArg("a", dtype, shape=("n"), order="C"), - lp.GlobalArg("b", dtype, shape=("n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", - inner_tag="l.0") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", - inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] - - n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) - - assert a.shape == (50,) - assert b.shape == (50,) - assert (a == 1).all() - assert (b == 2).all() - - -def test_bare_data_dependency(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - [ - "[znirp] -> {[i]: 0<=i<znirp}", - ], - [ - "<> znirp = n", - "a[i] = 1", - ], - [ - lp.GlobalArg("a", dtype, shape=("n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - cknl = lp.CompiledKernel(ctx, knl) - n = 20000 - evt, (a,) = cknl(queue, n=n, out_host=True) - - assert a.shape == (n,) - assert (a == 1).all() - - -def test_equality_constraints(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - order = "C" - - n = 10 - - knl = lp.make_kernel([ - "[n] -> {[i,j]: 0<=i,j<n }", - "{[k]: k =i+5 and k < n}", - ], - [ - "a[i,j] = 5 {id=set_all}", - "b[i,k] = 22 {dep=set_all}", - ], - [ - lp.GlobalArg("a,b", dtype, shape="n, n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - name="equality_constraints", assumptions="n>=1") - - seq_knl = knl - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - #print(knl) - #print(knl.domains[0].detect_equalities()) - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n), print_ref_code=True) - - -def test_stride(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - order = "C" - - n = 10 - - knl = lp.make_kernel([ - "{[i]: 0<=i<n and (exists l: i = 2*l)}", - ], - [ - "a[i] = 5", - ], - [ - lp.GlobalArg("a", dtype, shape="n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - seq_knl = knl - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n)) - - -def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - 
order = "C" - - n = 10 - - knl = lp.make_kernel([ - "{[i]: 0<=i<n }", - "{[k]: k=i and (exists l: k = 2*l) }", - ], - [ - "a[i] = 5 {id=set}", - "b[k] = 6 {dep=set}", - ], - [ - lp.GlobalArg("a,b", dtype, shape="n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - seq_knl = knl - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n)) - - -def test_double_sum(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 20 - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - [ - "a = simul_reduce(sum, (i,j), i*j)", - "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))", - ], - assumptions="n>=1") - - evt, (a, b) = knl(queue, n=n) - - ref = sum(i*j for i in range(n) for j in range(n)) - assert a.get() == ref - assert b.get() == ref - - -@pytest.mark.parametrize(("op_name", "np_op"), [ - ("sum", np.sum), - ("product", np.prod), - ("min", np.min), - ("max", np.max), - ]) -def test_reduction_library(ctx_factory, op_name, np_op): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<m }", - [ - "res[i] = reduce(%s, j, a[i,j])" % op_name, - ], - assumptions="n>=1") - - a = np.random.randn(20, 10) - evt, (res,) = knl(queue, a=a) - - assert np.allclose(res, np_op(a, axis=1)) - - -def test_double_sum_made_unique(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 20 - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - [ - "a = sum((i,j), i*j)", - "b = sum(i, sum(j, i*j))", - ], - assumptions="n>=1") - - knl = lp.make_reduction_inames_unique(knl) - print(knl) - - evt, (a, b) = knl(queue, n=n) - - ref = sum(i*j for i in range(n) for j in range(n)) - assert a.get() == ref - assert b.get() == ref - - -# {{{ test race detection - -@pytest.mark.skipif("sys.version_info < (2,6)") -def test_ilp_write_race_detection_global(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "[n] -> {[i,j]: 0<=i,j<n }", - [ - "a[i] = 5+i+j", - ], - [ - lp.GlobalArg("a", np.float32), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - knl = lp.tag_inames(knl, dict(j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - with lp.CacheMode(False): - from loopy.diagnostic import WriteRaceConditionWarning - from warnings import catch_warnings - with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl)) - - assert any(isinstance(w.message, WriteRaceConditionWarning) - for w in warn_list) - - -def test_ilp_write_race_avoidance_local(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<16 and 0<=j<17 }", - [ - "<> a[i] = 5+i+j", - ], - []) - - knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) - - -def test_ilp_write_race_avoidance_private(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[j]: 0<=j<16 }", - [ - "<> a = 5+j", - ], - []) - - knl = lp.tag_inames(knl, dict(j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) - -# }}} - - -def test_write_parameter(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - """ - a = sum((i,j), i*j) - b = sum(i, sum(j, i*j)) - n = 15 - """, - [ - lp.GlobalArg("a", dtype, shape=()), - lp.GlobalArg("b", dtype, shape=()), - lp.ValueArg("n", 
np.int32, approximately=1000), - ], - assumptions="n>=1") - - import pytest - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() - - -# {{{ arg guessing - -def test_arg_shape_guessing(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - """ - a = 1.5 + sum((i,j), i*j) - b[i, j] = i*j - c[i+j, j] = b[j,i] - """, - [ - lp.GlobalArg("a", shape=lp.auto), - lp.GlobalArg("b", shape=lp.auto), - lp.GlobalArg("c", shape=lp.auto), - lp.ValueArg("n"), - ], - assumptions="n>=1") - - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - - -def test_arg_guessing(ctx_factory): - ctx = ctx_factory() +def test_arg_guessing(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", @@ -1273,18 +517,6 @@ def test_nonlinear_index(ctx_factory): print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) -def test_triangle_domain(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n and i <= j}", - "a[i,j] = 17", - assumptions="n>=1") - - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - - def test_offsets_and_slicing(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1348,129 +580,6 @@ def test_vector_ilp_with_prefetch(ctx_factory): assert len(list(re.finditer("barrier", code))) == 1 -# {{{ convolutions - -def test_convolution(ctx_factory): - ctx = ctx_factory() - - dtype = np.float32 - - knl = lp.make_kernel( - "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ - -f_w <= f_x,f_y <= f_w \ - and 0 <= im_x < im_w and 0 <= im_y < im_h \ - and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ - }", - """ - out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \ - img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \ - * f[ifeat, f_w+f_x, f_w+f_y, icolor]) - """, - [ - lp.GlobalArg("f", dtype, shape=lp.auto), - lp.GlobalArg("img", dtype, shape=lp.auto), - lp.GlobalArg("out", dtype, shape=lp.auto), - "..." - ], - assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", - flags="annotate_inames", - defines=dict(ncolors=3)) - - f_w = 3 - - knl = lp.fix_parameters(knl, f_w=f_w) - - ref_knl = knl - - def variant_0(knl): - #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") - return knl - - def variant_1(knl): - knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") - return knl - - def variant_2(knl): - knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]") - knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y") - return knl - - for variant in [ - variant_0, - variant_1, - variant_2 - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - parameters=dict( - im_w=128, im_h=128, f_w=f_w, - nfeats=3, nimgs=3 - )) - - -def test_convolution_with_nonzero_base(ctx_factory): - # This is kept alive as a test for domains that don't start at zero. - # These are a bad idea for split_iname, which places its origin at zero - # and therefore produces a first block that is odd-sized. - # - # Therefore, for real tests, check test_convolution further up. 
- - ctx = ctx_factory() - - dtype = np.float32 - - knl = lp.make_kernel( - "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ - -f_w <= f_x,f_y <= f_w \ - and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \ - and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ - }", - """ - out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \ - img[iimg, im_x-f_x, im_y-f_y, icolor] \ - * f[ifeat, f_w+f_x, f_w+f_y, icolor]) - """, - [ - lp.GlobalArg("f", dtype, shape=lp.auto), - lp.GlobalArg("img", dtype, shape=lp.auto), - lp.GlobalArg("out", dtype, shape=lp.auto), - "..." - ], - assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", - flags="annotate_inames", - defines=dict(ncolors=3)) - - ref_knl = knl - - f_w = 3 - - def variant_0(knl): - #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") - return knl - - def variant_1(knl): - knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") - return knl - - for variant in [ - variant_0, - variant_1, - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - parameters=dict( - im_w=128, im_h=128, f_w=f_w, - nfeats=12, nimgs=17 - )) - -# }}} - - def test_c_instruction(ctx_factory): #logging.basicConfig(level=logging.DEBUG) ctx = ctx_factory() @@ -1529,36 +638,21 @@ def test_dependent_domain_insn_iname_finding(ctx_factory): ))) -def test_inames_deps_from_write_subscript(ctx_factory): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - """ - <> src_ibox = source_boxes[i] - <int32> something = 5 - a[src_ibox] = sum(j, something) {id=myred} - """, - [ - lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", - None, shape=None), - "..."]) - - print(knl) - assert "i" in knl.insn_inames("myred") - - -def test_split_reduction(ctx_factory): +def test_inames_deps_from_write_subscript(ctx_factory): knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j,k<n}", + "{[i,j]: 0<=i,j<n}", """ - b = sum((i,j,k), a[i,j,k]) + <> src_ibox = source_boxes[i] + <int32> something = 5 + a[src_ibox] = sum(j, something) {id=myred} """, [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), "..."]) - knl = lp.split_reduction_outward(knl, "j,k") - # FIXME: finish test + print(knl) + assert "i" in knl.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -1582,132 +676,6 @@ def test_modulo_indexing(ctx_factory): ))) -def test_rob_stroud_bernstein(ctx_factory): - ctx = ctx_factory() - - # NOTE: tmp would have to be zero-filled beforehand - - knl = lp.make_kernel( - "{[el, i2, alpha1,alpha2]: \ - 0 <= el < nels and \ - 0 <= i2 < nqp1d and \ - 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", - """ - <> xi = qpts[1, i2] {inames=+el} - <> s = 1-xi - <> r = xi/s - <> aind = 0 {id=aind_init,inames=+i2:el} - - <> w = s**(deg-alpha1) {id=init_w} - - tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp,inames=+alpha2} - w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ - {id=update_w,dep=init_w:write_tmp} - aind = aind + 1 \ - {id=aind_incr,\ - dep=aind_init:write_tmp:update_w, \ - inames=+el:i2:alpha1:alpha2} - """, - [ - # Must declare coeffs to have "no" shape, to keep loopy - # from trying to figure it out the shape automatically. - - lp.GlobalArg("coeffs", None, shape=None), - "..." 
- ], - assumptions="deg>=0 and nels>=1" - ) - - knl = lp.fix_parameters(knl, nqp1d=7, deg=4) - knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") - knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", - slabs=(0, 1)) - knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - qpts=np.float32, - coeffs=np.float32, - tmp=np.float32, - ))) - - -def test_rob_stroud_bernstein_full(ctx_factory): - #logging.basicConfig(level=logging.DEBUG) - ctx = ctx_factory() - - # NOTE: result would have to be zero-filled beforehand - - knl = lp.make_kernel( - "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \ - 0 <= el < nels and \ - 0 <= i2 < nqp1d and \ - 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\ - \ - 0 <= i1_2 < nqp1d and \ - 0 <= alpha1_2 <= deg and \ - 0 <= i2_2 < nqp1d \ - }", - """ - <> xi = qpts[1, i2] {inames=+el} - <> s = 1-xi - <> r = xi/s - <> aind = 0 {id=aind_init,inames=+i2:el} - - <> w = s**(deg-alpha1) {id=init_w} - - <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp,inames=+alpha2} - w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ - {id=update_w,dep=init_w:write_tmp} - aind = aind + 1 \ - {id=aind_incr,\ - dep=aind_init:write_tmp:update_w, \ - inames=+el:i2:alpha1:alpha2} - - <> xi2 = qpts[0, i1_2] {dep=aind_incr,inames=+el} - <> s2 = 1-xi2 - <> r2 = xi2/s2 - <> w2 = s2**deg - - result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \ - w2 * tmp[alpha1_2, i2_2] \ - {inames=el:alpha1_2:i1_2:i2_2} - - w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) - """, - [ - # Must declare coeffs to have "no" shape, to keep loopy - # from trying to figure it out the shape automatically. - - lp.GlobalArg("coeffs", None, shape=None), - "..." - ], - assumptions="deg>=0 and nels>=1" - ) - - knl = lp.fix_parameters(knl, nqp1d=7, deg=4) - - if 0: - knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") - knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", - slabs=(0, 1)) - knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - from pickle import dumps, loads - knl = loads(dumps(knl)) - - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - qpts=np.float32, - tmp=np.float32, - coeffs=np.float32, - result=np.float32, - )) - print(knl) - - @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) def test_vector_types(ctx_factory, vec_len): ctx = ctx_factory() @@ -1736,28 +704,6 @@ def test_vector_types(ctx_factory, vec_len): )) -def test_tag_data_axes(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{ [i,j,k]: 0<=i,j,k<n }", - "out[i,j,k] = 15") - - ref_knl = knl - - with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,N5") - - with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,c") - - knl = lp.tag_data_axes(knl, "out", "N1,N0,N2") - knl = lp.tag_inames(knl, dict(j="g.0", i="g.1")) - - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters=dict(n=20)) - - def test_conditional(ctx_factory): #logging.basicConfig(level=logging.DEBUG) ctx = ctx_factory() @@ -1779,524 +725,152 @@ def test_conditional(ctx_factory): ref_knl = knl - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters=dict( - n=200 - )) - - -def test_ilp_loop_bound(ctx_factory): - # The salient bit of this test is that a joint bound on (outer, inner) - # from a split occurs in a setting where the inner loop has been ilp'ed. - # In 'normal' parallel loops, the inner index is available for conditionals - # throughout. 
In ILP'd loops, not so much. - - ctx = ctx_factory() - knl = lp.make_kernel( - "{ [i,j,k]: 0<=i,j,k<n }", - """ - out[i,k] = sum(j, a[i,j]*b[j,k]) - """, - [ - lp.GlobalArg("a,b", np.float32, shape=lp.auto), - "...", - ], - assumptions="n>=1") - - ref_knl = knl - - knl = lp.set_loop_priority(knl, "j,i,k") - knl = lp.split_iname(knl, "k", 4, inner_tag="ilp") - - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters=dict( - n=200 - )) - - -def test_arg_shape_uses_assumptions(ctx_factory): - # If arg shape determination does not use assumptions, then it won't find a - # static shape for out, which is at least 1 x 1 in size, but otherwise of - # size n x n. - - lp.make_kernel( - "{ [i,j]: 0<=i,j<n }", - """ - out[i,j] = 2*a[i,j] - out[0,0] = 13.0 - """, assumptions="n>=1") - - -def test_slab_decomposition_does_not_double_execute(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "a[i] = 2*a[i]", - assumptions="n>=1") - - ref_knl = knl - - for outer_tag in ["for", "g.0"]: - knl = ref_knl - knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr", - outer_tag=outer_tag) - knl = lp.set_loop_priority(knl, "i_outer") - - a = cl.array.empty(queue, 20, np.float32) - a.fill(17) - a_ref = a.copy() - a_knl = a.copy() - - knl = lp.set_options(knl, write_cl=True) - print("TEST-----------------------------------------") - knl(queue, a=a_knl) - print("REF-----------------------------------------") - ref_knl(queue, a=a_ref) - print("DONE-----------------------------------------") - - print("REF", a_ref) - print("KNL", a_knl) - assert (a_ref == a_knl).get().all() - - print("_________________________________") - - -def test_multiple_writes_to_local_temporary(): - # Loopy would previously only handle barrier insertion correctly if exactly - # one instruction wrote to each local temporary. This tests that multiple - # writes are OK. 
- - knl = lp.make_kernel( - "{[i,e]: 0<=i<5 and 0<=e<nelements}", - """ - <> temp[i, 0] = 17 - temp[i, 1] = 15 - """) - knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) - - -def test_fd_demo(): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ - + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ - + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") - #assumptions="n mod 16=0") - knl = lp.split_iname(knl, - "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, - "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "u", - ["i_inner", "j_inner"], - fetch_bounding_box=True) - - #n = 1000 - #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) - - knl = lp.set_options(knl, write_cl=True) - knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) - code, inf = lp.generate_code(knl) - print(code) - - assert "double" not in code - - -def test_fd_1d(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "result[i] = u[i+1]-u[i]") - - knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) - ref_knl = knl - - knl = lp.split_iname(knl, "i", 16) - knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") - knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") - knl = lp.assume(knl, "n mod 16 = 0") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=2048)) - - -def test_make_copy_kernel(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - intermediate_format = "f,f,sep" - - a1 = np.random.randn(1024, 4, 3) - - cknl1 = lp.make_copy_kernel(intermediate_format) - - cknl1 = lp.fix_parameters(cknl1, n2=3) - - cknl1 = lp.set_options(cknl1, write_cl=True) - evt, a2 = cknl1(queue, input=a1) - - cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format) - cknl2 = lp.fix_parameters(cknl2, n2=3) - - evt, a3 = cknl2(queue, input=a2) - - assert (a1 == a3).all() - - -def test_set_arg_order(): - knl = lp.make_kernel( - "{ [i,j]: 0<=i,j<n }", - "out[i,j] = a[i]*b[j]") - - knl = lp.set_argument_order(knl, "out,a,n,b") - - -def test_affine_map_inames(): - knl = lp.make_kernel( - "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}", - "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]") - - knl = lp.affine_map_inames(knl, - "i", "i0", - "i0 = n+i") - - print(knl) - - -def test_precompute_confusing_subst_arguments(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<5}", - """ - D(i):=a[i+1]-a[i] - b[i,j] = D(j) - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - knl = lp.tag_inames(knl, dict(j="g.1")) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - - from loopy.symbolic import get_dependencies - assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=12345)) - - -def test_precompute_nested_subst(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<5}", - """ - E:=a[i] - D:=E*E - b[i] = D - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - knl = lp.tag_inames(knl, dict(j="g.1")) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - - from loopy.symbolic import get_dependencies - assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - 
knl = lp.precompute(knl, "D", "i_inner") - - # There's only one surviving 'E' rule. - assert len([ - rule_name - for rule_name in knl.substitutions - if rule_name.startswith("E")]) == 1 - - # That rule should use the newly created prefetch inames, - # not the prior 'i_inner' - assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression) - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=12345)) - - -def test_poisson(ctx_factory): - # Stolen from Peter Coogan and Rob Kirby for FEM assembly - ctx = ctx_factory() - - nbf = 5 - nqp = 5 - sdim = 3 - - knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ - 0 <= c < nels and \ - 0 <= i < nbf and \ - 0 <= j < nbf and \ - 0 <= k < nqp and \ - 0 <= ell,ell2 < sdim}", - """ - dpsi(bf,k0,dir) := \ - simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) - Ael[c,i,j] = \ - J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) - """, - assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") - - print(knl) - - knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) - - ref_knl = knl - - knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"]) - - def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') - knl = lp.set_loop_priority(knl, "c,i,j") - return knl - - def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') - knl = lp.set_loop_priority(knl, "c,i,j") - return knl - - def add_types(knl): - return lp.add_and_infer_dtypes(knl, dict( - w=np.float32, - J=np.float32, - DPsi=np.float32, - DFinv=np.float32, - )) - - for variant in [ - #variant_1, - variant_2 - ]: - knl = variant(knl) + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters=dict( + n=200 + )) - lp.auto_test_vs_ref( - add_types(ref_knl), ctx, add_types(knl), - parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) +def test_ilp_loop_bound(ctx_factory): + # The salient bit of this test is that a joint bound on (outer, inner) + # from a split occurs in a setting where the inner loop has been ilp'ed. + # In 'normal' parallel loops, the inner index is available for conditionals + # throughout. In ILP'd loops, not so much. 
-def test_auto_test_can_detect_problems(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - """ - a[i,j] = 25 - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + "{ [i,j,k]: 0<=i,j,k<n }", + """ + out[i,k] = sum(j, a[i,j]*b[j,k]) + """, + [ + lp.GlobalArg("a,b", np.float32, shape=lp.auto), + "...", + ], + assumptions="n>=1") ref_knl = knl - knl = lp.link_inames(knl, "i,j", "i0") - - from loopy.diagnostic import AutomaticTestFailure - with pytest.raises(AutomaticTestFailure): - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=123)) - - -def test_generate_c_snippet(): - from loopy.target.c import CTarget - - from pymbolic import var - I = var("I") # noqa - f = var("f") - df = var("df") - q_v = var("q_v") - eN = var("eN") # noqa - k = var("k") - u = var("u") - - from functools import partial - l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True) - - Instr = lp.Assignment # noqa + knl = lp.set_loop_priority(knl, "j,i,k") + knl = lp.split_iname(knl, "k", 4, inner_tag="ilp") - knl = lp.make_kernel( - "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", - [ - Instr(f[I], l_sum(k, q_v[k, I]*u)), - Instr(df[I], l_sum(k, q_v[k, I])), - ], - [ - lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"), - lp.GlobalArg("f,df", np.float64, shape="nSpace"), - lp.ValueArg("u", np.float64), - "...", - ], - target=CTarget(), - assumptions="nQuad>=1") + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters=dict( + n=200 + )) - if 0: # enable to play with prefetching - # (prefetch currently requires constant sizes) - knl = lp.fix_parameters(knl, nQuad=5, nSpace=3) - knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None) - knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) - knl = lp.set_loop_priority(knl, "I,k_outer,k_inner") +def test_arg_shape_uses_assumptions(ctx_factory): + # If arg shape determination does not use assumptions, then it won't find a + # static shape for out, which is at least 1 x 1 in size, but otherwise of + # size n x n. 
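+    # In other words: with n>=1, the writes out[i,j] (0<=i,j<n) and out[0,0]
+    # together cover exactly an n-by-n footprint, so a single static shape
+    # can be assigned; without the assumption, the out[0,0] write keeps the
+    # footprint from collapsing to one affine size.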
- knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - print(lp.generate_body(knl)) + lp.make_kernel( + "{ [i,j]: 0<=i,j<n }", + """ + out[i,j] = 2*a[i,j] + out[0,0] = 13.0 + """, assumptions="n>=1") -def test_precompute_with_preexisting_inames(ctx_factory): +def test_slab_decomposition_does_not_double_execute(ctx_factory): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}", - """ - result[e,i] = sum(j, D1[i,j]*u[e,j]) - result2[e,i] = sum(k, D2[i,k]*u[e,k]) - """) + "{ [i]: 0<=i<n }", + "a[i] = 2*a[i]", + assumptions="n>=1") - knl = lp.add_and_infer_dtypes(knl, { - "u": np.float32, - "D1": np.float32, - "D2": np.float32, - }) + ref_knl = knl - knl = lp.fix_parameters(knl, n=13) + for outer_tag in ["for", "g.0"]: + knl = ref_knl + knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr", + outer_tag=outer_tag) + knl = lp.set_loop_priority(knl, "i_outer") - ref_knl = knl + a = cl.array.empty(queue, 20, np.float32) + a.fill(17) + a_ref = a.copy() + a_knl = a.copy() - knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") - knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + knl = lp.set_options(knl, write_cl=True) + print("TEST-----------------------------------------") + knl(queue, a=a_knl) + print("REF-----------------------------------------") + ref_knl(queue, a=a_ref) + print("DONE-----------------------------------------") - knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", - precompute_inames="ii,jj") - knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for", - precompute_inames="ii,jj") + print("REF", a_ref) + print("KNL", a_knl) + assert (a_ref == a_knl).get().all() - knl = lp.set_loop_priority(knl, "ii,jj,e,j,k") + print("_________________________________") - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(E=200)) +def test_multiple_writes_to_local_temporary(): + # Loopy would previously only handle barrier insertion correctly if exactly + # one instruction wrote to each local temporary. This tests that multiple + # writes are OK. 
-def test_precompute_with_preexisting_inames_fail(): knl = lp.make_kernel( - "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", + "{[i,e]: 0<=i<5 and 0<=e<nelements}", """ - result[e,i] = sum(j, D1[i,j]*u[e,j]) - result2[e,i] = sum(k, D2[i,k]*u[e,k]) + <> temp[i, 0] = 17 + temp[i, 1] = 15 """) + knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.add_and_infer_dtypes(knl, { - "u": np.float32, - "D1": np.float32, - "D2": np.float32, - }) + knl = lp.preprocess_kernel(knl) + for k in lp.generate_loop_schedules(knl): + code, _ = lp.generate_code(k) + print(code) - knl = lp.fix_parameters(knl, n=13) - knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") - knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") +def test_make_copy_kernel(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", - precompute_inames="ii,jj") - with pytest.raises(lp.LoopyError): - lp.precompute(knl, "D2_subst", "i,k", default_tag="for", - precompute_inames="ii,jj") + intermediate_format = "f,f,sep" + a1 = np.random.randn(1024, 4, 3) -def test_vectorize(ctx_factory): - ctx = ctx_factory() + cknl1 = lp.make_copy_kernel(intermediate_format) - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - <> temp = 2*b[i] - a[i] = temp - """) - knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") - knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, - split_kwargs=dict(slabs=(0, 1))) + cknl1 = lp.fix_parameters(cknl1, n2=3) - knl = lp.tag_data_axes(knl, "a,b", "c,vec") - ref_knl = knl - ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) + cknl1 = lp.set_options(cknl1, write_cl=True) + evt, a2 = cknl1(queue, input=a1) - knl = lp.tag_inames(knl, {"i_inner": "vec"}) + cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format) + cknl2 = lp.fix_parameters(cknl2, n2=3) - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - code, inf = lp.generate_code(knl) + evt, a3 = cknl2(queue, input=a2) - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=30)) + assert (a1 == a3).all() -def test_alias_temporaries(ctx_factory): +def test_auto_test_can_detect_problems(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( - "{[i]: 0<=i<n}", + "{[i,j]: 0<=i,j<n}", """ - times2(i) := 2*a[i] - times3(i) := 3*a[i] - times4(i) := 4*a[i] - - x[i] = times2(i) - y[i] = times3(i) - z[i] = times4(i) + a[i,j] = 25 """) - knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) ref_knl = knl - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") - - knl = lp.precompute(knl, "times2", "i_inner") - knl = lp.precompute(knl, "times3", "i_inner") - knl = lp.precompute(knl, "times4", "i_inner") - - knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=30)) - - -def test_fusion(): - exp_kernel = lp.make_kernel( - ''' { [i]: 0<=i<n } ''', - ''' exp[i] = pow(E, z[i])''', - assumptions="n>0") - - sum_kernel = lp.make_kernel( - '{ [j]: 0<=j<n }', - 'out2 = sum(j, exp[j])', - assumptions='n>0') - - knl = lp.fuse_kernels([exp_kernel, sum_kernel]) + knl = lp.link_inames(knl, "i,j", "i0") - print(knl) + from loopy.diagnostic import AutomaticTestFailure + with pytest.raises(AutomaticTestFailure): + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=123)) def test_sci_notation_literal(ctx_factory): @@ -2314,37 +888,6 @@ def 
test_sci_notation_literal(ctx_factory): assert (np.abs(out.get() - 1e-12) < 1e-20).all() -def test_rename_argument(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - kernel = lp.make_kernel( - '''{ [i]: 0<=i<n }''', - '''out[i] = a + 2''') - - kernel = lp.rename_argument(kernel, "a", "b") - - evt, (out,) = kernel(queue, b=np.float32(12), n=20) - - assert (np.abs(out.get() - 14) < 1e-8).all() - - -def test_to_batched(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j<n } ''', - ''' out[i] = sum(j, a[i,j]*x[j])''') - - bknl = lp.to_batched(knl, "nbatches", "out,x") - - a = np.random.randn(5, 5) - x = np.random.randn(7, 5) - - bknl(queue, a=a, x=x) - - def test_variable_size_temporary(): knl = lp.make_kernel( ''' { [i,j]: 0<=i,j<n } ''', @@ -2399,57 +942,6 @@ def test_indexof_vec(ctx_factory): #assert np.array_equal(out.ravel(order="C"), np.arange(25)) -def test_finite_difference_expr_subst(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - grid = np.linspace(0, 2*np.pi, 2048, endpoint=False) - h = grid[1] - grid[0] - u = cl.clmath.sin(cl.array.to_device(queue, grid)) - - fin_diff_knl = lp.make_kernel( - "{[i]: 1<=i<=n}", - "out[i] = -(f[i+1] - f[i-1])/h", - [lp.GlobalArg("out", shape="n+2"), "..."]) - - flux_knl = lp.make_kernel( - "{[j]: 1<=j<=n}", - "f[j] = u[j]**2/2", - [ - lp.GlobalArg("f", shape="n+2"), - lp.GlobalArg("u", shape="n+2"), - ]) - - fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], - data_flow=[ - ("f", 1, 0) - ]) - - fused_knl = lp.set_options(fused_knl, write_cl=True) - evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) - - fused_knl = lp.assignment_to_subst(fused_knl, "f") - - fused_knl = lp.set_options(fused_knl, write_cl=True) - - # This is the real test here: The automatically generated - # shape expressions are '2+n' and the ones above are 'n+2'. - # Is loopy smart enough to understand that these are equal? 
- evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) - - fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i") - - gpu_knl = lp.split_iname( - fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0") - - precomp_knl = lp.precompute( - gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) - - precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) - precomp_knl = lp.set_options(precomp_knl, return_dict=True) - evt, _ = precomp_knl(queue, u=u, h=h) - - def test_is_expression_equal(): from loopy.symbolic import is_expression_equal from pymbolic import var @@ -2463,92 +955,6 @@ def test_is_expression_equal(): assert is_expression_equal((x+y)**2, x**2 + 2*x*y + y**2) -def test_collect_common_factors(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j<n}", - """ - <float32> out_tmp = 0 {id=out_init,inames=i} - out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} - out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init} - out[i] = out_tmp {dep=out_up1:out_up2} - """) - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32)) - - ref_knl = knl - - knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") - knl = lp.collect_common_factors_on_increment(knl, "out_tmp") - - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13)) - - -def test_ispc_target(occa_mode=False): - from loopy.target.ispc import ISPCTarget - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ], - target=ISPCTarget(occa_mode=occa_mode)) - - knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") - knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - - codegen_result = lp.generate_code_v2( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl))) - - print(codegen_result.device_code()) - print(codegen_result.host_code()) - - -def test_cuda_target(): - from loopy.target.cuda import CudaTarget - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ], - target=CudaTarget()) - - knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") - knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - - print( - lp.generate_code( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) - - -def test_chunk_iname(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." 
- ], - assumptions="n>0") - - ref_knl = knl - knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "i_outer, i_inner") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130)) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) def test_atomic(ctx_factory, dtype): ctx = ctx_factory() @@ -2580,23 +986,6 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) -def test_clamp(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 15 * 10**6 - x = cl.clrandom.rand(queue, n, dtype=np.float32) - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = clamp(x[i], a, b)") - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.set_options(knl, write_cl=True) - - evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) - - def test_forced_iname_deps_and_reduction(): # See https://github.com/inducer/loopy/issues/24 @@ -2632,42 +1021,6 @@ def test_forced_iname_deps_and_reduction(): print(k.stringify(with_dependencies=True)) -@pytest.mark.parametrize("tp", ["f32", "f64"]) -def test_random123(ctx_factory, tp): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - import pyopencl.version # noqa - if cl.version.VERSION < (2016, 2): - pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") - - n = 150000 - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - """ - <> key2 = make_uint2(i, 324830944) {inames=i} - <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} - <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} - <> real, ctr = philox4x32_TYPE(ctr, key2) {dep=init_ctr} - <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr} - - out[i, 0] = real.s0 + 1j * imag.s0 - out[i, 1] = real.s1 + 1j * imag.s1 - out[i, 2] = real.s2 + 1j * imag.s2 - out[i, 3] = real.s3 + 1j * imag.s3 - """.replace("TYPE", tp)) - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.set_options(knl, write_cl=True) - - evt, (out,) = knl(queue, n=n) - - out = out.get() - assert (out < 1).all() - assert (0 <= out).all() - - def test_kernel_splitting(ctx_factory): ctx = ctx_factory() diff --git a/test/test_reduction.py b/test/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..d0edbf08ccd9cbc53669c0bd418315db50eca5d7 --- /dev/null +++ b/test/test_reduction.py @@ -0,0 +1,554 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_nonsense_reduction(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<100}", + """ + a[i] = sum(i, 2) + """, + [lp.GlobalArg("a", np.float32, shape=(100,))] + ) + + import pytest + with pytest.raises(RuntimeError): + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + +def test_empty_reduction(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<20}", + "[i] -> {[j]: 0<=j<0}" + ], + "a[i] = sum(j, j)", + ) + + knl = lp.realize_reduction(knl) + print(knl) + + knl = lp.set_options(knl, write_cl=True) + evt, (a,) = knl(queue) + + assert (a.get() == 0).all() + + +def test_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[j]: 0<=j<i+sumlen}" + ], + [ + "<> sumlen = l[i]", + "a[i] = sum(j, j)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("l", np.int32, ("n",)), + ]) + + cknl = lp.CompiledKernel(ctx, knl) + + n = 330 + l = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + + tgt_result = (2*l-1)*2*l/2 + assert (a == tgt_result).all() + + +def test_multi_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[itgt]: 0 <= itgt < ntgts}", + "{[isrc_box]: 0 <= isrc_box < nboxes}", + "{[isrc]: 0 <= isrc < npart}" + ], + [ + "<> npart = nparticles_per_box[isrc_box]", + "a[itgt] = sum((isrc_box, isrc), 1)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), + lp.ValueArg("ntgts", np.int32), + lp.ValueArg("nboxes", np.int32), + ], + assumptions="ntgts>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print(cknl.get_code()) + # FIXME: Actually test functionality. + + +def test_recursive_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[itgt]: 0 <= itgt < ntgts}", + "{[isrc_box]: 0 <= isrc_box < nboxes}", + "{[isrc]: 0 <= isrc < npart}" + ], + [ + "<> npart = nparticles_per_box[isrc_box]", + "<> boxsum = sum(isrc, isrc+isrc_box+itgt)", + "a[itgt] = sum(isrc_box, boxsum)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), + lp.ValueArg("ntgts", np.int32), + lp.ValueArg("nboxes", np.int32), + ], + assumptions="ntgts>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print(cknl.get_code()) + # FIXME: Actually test functionality. 
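+
+    # A possible way to address the FIXME above (a sketch only -- the call
+    # mirrors test_nested_dependent_reduction, and passing n=ntgts to size
+    # 'a' is an assumption): compare against a direct NumPy evaluation of
+    #   a[itgt] = sum_{isrc_box} sum_{isrc < npart} (isrc + isrc_box + itgt)
+    #
+    #   queue = cl.CommandQueue(ctx)
+    #   nboxes, ntgts = 7, 20
+    #   nparticles_per_box = np.arange(1, nboxes+1, dtype=np.int32)
+    #   evt, (a,) = cknl(queue, nparticles_per_box=nparticles_per_box,
+    #           ntgts=ntgts, nboxes=nboxes, n=ntgts, out_host=True)
+    #   ref = np.array([
+    #       sum(isrc + isrc_box + itgt
+    #           for isrc_box, npart in enumerate(nparticles_per_box)
+    #           for isrc in range(npart))
+    #       for itgt in range(ntgts)], dtype=np.int32)
+    #   assert (a == ref).all()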
+ + +@pytest.mark.parametrize("size", [128, 5, 113, 67]) +def test_local_parallel_reduction(ctx_factory, size): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i, j]: 0 <= i < n and 0 <= j < 5}", + """ + z[j] = sum(i, i+j) + """) + + knl = lp.fix_parameters(knl, n=size) + + ref_knl = knl + + def variant0(knl): + return lp.tag_inames(knl, "i:l.0") + + def variant1(knl): + return lp.tag_inames(knl, "i:l.0,j:l.1") + + def variant2(knl): + return lp.tag_inames(knl, "i:l.0,j:g.0") + + for variant in [ + variant0, + variant1, + variant2 + ]: + knl = variant(ref_knl) + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +# FIXME: Make me a test +@pytest.mark.parametrize("size", [10000]) +def no_test_global_parallel_reduction(ctx_factory, size): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + <> key = make_uint2(i, 324830944) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) + """) + + # ref_knl = knl + + gsize = 128 + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + knl = lp.split_reduction_inward(knl, "i_inner_outer") + from loopy.transform.data import reduction_arg_to_subst_rule + knl = reduction_arg_to_subst_rule(knl, "i_outer") + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer") + print(knl) + 1/0 + knl = lp.realize_reduction(knl) + + evt, (z,) = knl(queue, n=size) + + #lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +@pytest.mark.parametrize("size", [10000]) +def test_global_parallel_reduction_simpler(ctx_factory, size): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}", + """ + <> key = make_uint2(l+nl*g, 1234) {inames=l:g} + <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + + <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3) + + result = sum(j, tmp[j]) + """) + + ng = 50 + knl = lp.fix_parameters(knl, ng=ng) + + knl = lp.set_options(knl, write_cl=True) + + ref_knl = knl + + knl = lp.split_iname(knl, "l", 128, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "l_inner") + knl = lp.tag_inames(knl, "g:g.0,j:l.0") + + evt, (result,) = knl(queue, nl=size) + evt, (result_ref,) = ref_knl(queue, nl=size) + + nsamples = size * 2 * ng + print(result.get()/nsamples, result_ref.get()/nsamples) + assert abs(result.get() - result_ref.get()) / abs(result_ref.get()) < 1e-5 + + # FIXME: auto_test breaks + #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size}) + + +def test_argmax(ctx_factory): + logging.basicConfig(level=logging.INFO) + + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 10000 + + knl = lp.make_kernel( + "{[i]: 0<=i<%d}" % n, + """ + max_val, max_idx = argmax(i, fabs(a[i])) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + print(lp.preprocess_kernel(knl)) + knl = lp.set_options(knl, write_cl=True, highlight_cl=True) + + a = np.random.randn(10000).astype(dtype) + evt, (max_idx, max_val) = knl(queue, a=a, out_host=True) + assert max_val == np.max(np.abs(a)) + assert max_idx == np.where(np.abs(a) == max_val)[-1] + + +def test_simul_reduce(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 20 + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + [ + 
"a = simul_reduce(sum, (i,j), i*j)", + "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))", + ], + assumptions="n>=1") + + evt, (a, b) = knl(queue, n=n) + + ref = sum(i*j for i in range(n) for j in range(n)) + assert a.get() == ref + assert b.get() == ref + + +@pytest.mark.parametrize(("op_name", "np_op"), [ + ("sum", np.sum), + ("product", np.prod), + ("min", np.min), + ("max", np.max), + ]) +def test_reduction_library(ctx_factory, op_name, np_op): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<m }", + [ + "res[i] = reduce(%s, j, a[i,j])" % op_name, + ], + assumptions="n>=1") + + a = np.random.randn(20, 10) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np_op(a, axis=1)) + + +def test_split_reduction(ctx_factory): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j,k<n}", + """ + b = sum((i,j,k), a[i,j,k]) + """, + [ + lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", + None, shape=None), + "..."]) + + knl = lp.split_reduction_outward(knl, "j,k") + # FIXME: finish test + + +def test_double_sum_made_unique(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 20 + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + [ + "a = sum((i,j), i*j)", + "b = sum(i, sum(j, i*j))", + ], + assumptions="n>=1") + + knl = lp.make_reduction_inames_unique(knl) + print(knl) + + evt, (a, b) = knl(queue, n=n) + + ref = sum(i*j for i in range(n) for j in range(n)) + assert a.get() == ref + assert b.get() == ref + + +def test_fd_demo(): + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n}", + "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") + #assumptions="n mod 16=0") + knl = lp.split_iname(knl, + "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, + "j", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.add_prefetch(knl, "u", + ["i_inner", "j_inner"], + fetch_bounding_box=True) + + #n = 1000 + #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) + + knl = lp.set_options(knl, write_cl=True) + knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) + code, inf = lp.generate_code(knl) + print(code) + + assert "double" not in code + + +def test_fd_1d(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "result[i] = u[i+1]-u[i]") + + knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") + knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") + knl = lp.assume(knl, "n mod 16 = 0") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=2048)) + + +def test_poisson_fem(ctx_factory): + # Stolen from Peter Coogan and Rob Kirby for FEM assembly + ctx = ctx_factory() + + nbf = 5 + nqp = 5 + sdim = 3 + + knl = lp.make_kernel( + "{ [c,i,j,k,ell,ell2,ell3]: \ + 0 <= c < nels and \ + 0 <= i < nbf and \ + 0 <= j < nbf and \ + 0 <= k < nqp and \ + 0 <= ell,ell2 < sdim}", + """ + dpsi(bf,k0,dir) := \ + simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) + Ael[c,i,j] = \ + J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) + """, + assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") + + print(knl) + + knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) + + ref_knl = knl + + knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"]) + + def variant_1(knl): + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + 
knl = lp.set_loop_priority(knl, "c,i,j") + return knl + + def variant_2(knl): + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.set_loop_priority(knl, "c,i,j") + return knl + + def add_types(knl): + return lp.add_and_infer_dtypes(knl, dict( + w=np.float32, + J=np.float32, + DPsi=np.float32, + DFinv=np.float32, + )) + + for variant in [ + #variant_1, + variant_2 + ]: + knl = variant(knl) + + lp.auto_test_vs_ref( + add_types(ref_knl), ctx, add_types(knl), + parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) + + +def test_finite_difference_expr_subst(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + grid = np.linspace(0, 2*np.pi, 2048, endpoint=False) + h = grid[1] - grid[0] + u = cl.clmath.sin(cl.array.to_device(queue, grid)) + + fin_diff_knl = lp.make_kernel( + "{[i]: 1<=i<=n}", + "out[i] = -(f[i+1] - f[i-1])/h", + [lp.GlobalArg("out", shape="n+2"), "..."]) + + flux_knl = lp.make_kernel( + "{[j]: 1<=j<=n}", + "f[j] = u[j]**2/2", + [ + lp.GlobalArg("f", shape="n+2"), + lp.GlobalArg("u", shape="n+2"), + ]) + + fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], + data_flow=[ + ("f", 1, 0) + ]) + + fused_knl = lp.set_options(fused_knl, write_cl=True) + evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) + + fused_knl = lp.assignment_to_subst(fused_knl, "f") + + fused_knl = lp.set_options(fused_knl, write_cl=True) + + # This is the real test here: The automatically generated + # shape expressions are '2+n' and the ones above are 'n+2'. + # Is loopy smart enough to understand that these are equal? + evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) + + fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i") + + gpu_knl = lp.split_iname( + fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0") + + precomp_knl = lp.precompute( + gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) + + precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) + precomp_knl = lp.set_options(precomp_knl, return_dict=True) + evt, _ = precomp_knl(queue, u=u, h=h) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_target.py b/test/test_target.py new file mode 100644 index 0000000000000000000000000000000000000000..67204d9c1fbc02624eebf230b87d33c114c21ef1 --- /dev/null +++ b/test/test_target.py @@ -0,0 +1,203 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_ispc_target(occa_mode=False): + from loopy.target.ispc import ISPCTarget + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + target=ISPCTarget(occa_mode=occa_mode)) + + knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") + knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + + codegen_result = lp.generate_code_v2( + lp.get_one_scheduled_kernel( + lp.preprocess_kernel(knl))) + + print(codegen_result.device_code()) + print(codegen_result.host_code()) + + +def test_cuda_target(): + from loopy.target.cuda import CudaTarget + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + target=CudaTarget()) + + knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") + knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + + print( + lp.generate_code( + lp.get_one_scheduled_kernel( + lp.preprocess_kernel(knl)))[0]) + + +def test_generate_c_snippet(): + from loopy.target.c import CTarget + + from pymbolic import var + I = var("I") # noqa + f = var("f") + df = var("df") + q_v = var("q_v") + eN = var("eN") # noqa + k = var("k") + u = var("u") + + from functools import partial + l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True) + + Instr = lp.Assignment # noqa + + knl = lp.make_kernel( + "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", + [ + Instr(f[I], l_sum(k, q_v[k, I]*u)), + Instr(df[I], l_sum(k, q_v[k, I])), + ], + [ + lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"), + lp.GlobalArg("f,df", np.float64, shape="nSpace"), + lp.ValueArg("u", np.float64), + "...", + ], + target=CTarget(), + assumptions="nQuad>=1") + + if 0: # enable to play with prefetching + # (prefetch currently requires constant sizes) + knl = lp.fix_parameters(knl, nQuad=5, nSpace=3) + knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None) + + knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) + knl = lp.set_loop_priority(knl, "I,k_outer,k_inner") + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + print(lp.generate_body(knl)) + + +@pytest.mark.parametrize("tp", ["f32", "f64"]) +def test_random123(ctx_factory, tp): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + import pyopencl.version # noqa + if cl.version.VERSION < (2016, 2): + pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") + + n = 150000 + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + <> key2 = make_uint2(i, 324830944) {inames=i} + <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) 
{inames=i,id=init_ctr} + <> real, ctr = philox4x32_TYPE(ctr, key2) {dep=init_ctr} + <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr} + + out[i, 0] = real.s0 + 1j * imag.s0 + out[i, 1] = real.s1 + 1j * imag.s1 + out[i, 2] = real.s2 + 1j * imag.s2 + out[i, 3] = real.s3 + 1j * imag.s3 + """.replace("TYPE", tp)) + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, n=n) + + out = out.get() + assert (out < 1).all() + assert (0 <= out).all() + + +def test_clamp(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 15 * 10**6 + x = cl.clrandom.rand(queue, n, dtype=np.float32) + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = clamp(x[i], a, b)") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9564274d12b2889d5010bbd807c4bd94fa149f --- /dev/null +++ b/test/test_transform.py @@ -0,0 +1,412 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_chunk_iname(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." 
+ ], + assumptions="n>0") + + ref_knl = knl + knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "i_outer, i_inner") + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130)) + + +def test_collect_common_factors(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j<n}", + """ + <float32> out_tmp = 0 {id=out_init,inames=i} + out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} + out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init} + out[i] = out_tmp {dep=out_up1:out_up2} + """) + knl = lp.add_and_infer_dtypes(knl, + dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32)) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") + knl = lp.collect_common_factors_on_increment(knl, "out_tmp") + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13)) + + +def test_to_batched(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + ''' out[i] = sum(j, a[i,j]*x[j])''') + + bknl = lp.to_batched(knl, "nbatches", "out,x") + + a = np.random.randn(5, 5) + x = np.random.randn(7, 5) + + bknl(queue, a=a, x=x) + + +def test_rename_argument(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + kernel = lp.make_kernel( + '''{ [i]: 0<=i<n }''', + '''out[i] = a + 2''') + + kernel = lp.rename_argument(kernel, "a", "b") + + evt, (out,) = kernel(queue, b=np.float32(12), n=20) + + assert (np.abs(out.get() - 14) < 1e-8).all() + + +def test_fusion(): + exp_kernel = lp.make_kernel( + ''' { [i]: 0<=i<n } ''', + ''' exp[i] = pow(E, z[i])''', + assumptions="n>0") + + sum_kernel = lp.make_kernel( + '{ [j]: 0<=j<n }', + 'out2 = sum(j, exp[j])', + assumptions='n>0') + + knl = lp.fuse_kernels([exp_kernel, sum_kernel]) + + print(knl) + + +def test_alias_temporaries(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + times2(i) := 2*a[i] + times3(i) := 3*a[i] + times4(i) := 4*a[i] + + x[i] = times2(i) + y[i] = times3(i) + z[i] = times4(i) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + + knl = lp.precompute(knl, "times2", "i_inner") + knl = lp.precompute(knl, "times3", "i_inner") + knl = lp.precompute(knl, "times4", "i_inner") + + knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + + +def test_vectorize(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + <> temp = 2*b[i] + a[i] = temp + """) + knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) + knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, + split_kwargs=dict(slabs=(0, 1))) + + knl = lp.tag_data_axes(knl, "a,b", "c,vec") + ref_knl = knl + ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) + + knl = lp.tag_inames(knl, {"i_inner": "vec"}) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + code, inf = lp.generate_code(knl) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + + +def test_extract_subst(ctx_factory): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + a[i] = 23*b[i]**2 + 25*b[i]**2 + """) + + knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha") + + print(knl) + + from loopy.symbolic import parse + + insn, = knl.instructions + assert 
insn.expression == parse("bsquare(23) + bsquare(25)") + + +def test_join_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<16}", + [ + "b[i,j] = 2*a[i,j]" + ], + [ + lp.GlobalArg("a", np.float32, shape=(16, 16,)), + lp.GlobalArg("b", np.float32, shape=(16, 16,)) + ], + ) + + ref_knl = knl + + knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) + knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) + + +def test_tag_data_axes(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i,j,k<n }", + "out[i,j,k] = 15") + + ref_knl = knl + + with pytest.raises(lp.LoopyError): + lp.tag_data_axes(knl, "out", "N1,N0,N5") + + with pytest.raises(lp.LoopyError): + lp.tag_data_axes(knl, "out", "N1,N0,c") + + knl = lp.tag_data_axes(knl, "out", "N1,N0,N2") + knl = lp.tag_inames(knl, dict(j="g.0", i="g.1")) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters=dict(n=20)) + + +def test_set_arg_order(): + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<n }", + "out[i,j] = a[i]*b[j]") + + knl = lp.set_argument_order(knl, "out,a,n,b") + + +def test_affine_map_inames(): + knl = lp.make_kernel( + "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}", + "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]") + + knl = lp.affine_map_inames(knl, + "i", "i0", + "i0 = n+i") + + print(knl) + + +def test_precompute_confusing_subst_arguments(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + D(i):=a[i+1]-a[i] + b[i,j] = D(j) + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + +def test_precompute_nested_subst(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + E:=a[i] + D:=E*E + b[i] = D + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D", "i_inner") + + # There's only one surviving 'E' rule. 
+ assert len([ + rule_name + for rule_name in knl.substitutions + if rule_name.startswith("E")]) == 1 + + # That rule should use the newly created prefetch inames, + # not the prior 'i_inner' + assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + +def test_precompute_with_preexisting_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + ref_knl = knl + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + knl = lp.set_loop_priority(knl, "ii,jj,e,j,k") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(E=200)) + + +def test_precompute_with_preexisting_inames_fail(): + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + with pytest.raises(lp.LoopyError): + lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker