__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import logging import numpy as np import pytest import pyopencl as cl import pyopencl.array import pyopencl.clmath import pyopencl.clrandom from pyopencl.tools import ( # noqa: F401 pytest_generate_tests_for_pyopencl as pytest_generate_tests, ) import loopy as lp from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 logger = logging.getLogger(__name__) def test_globals_decl_once_with_multi_subprogram(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) rng = np.random.default_rng(seed=17) a = rng.normal(size=16) cnst = rng.normal(size=16) knl = lp.make_kernel( "{[i, ii]: 0<=i, ii id:h and tag:two > id:g and tag:two") print(knl) sr_keys = list(knl["loopy_kernel"].substitutions.keys()) for letter, how_many in [ ("f", 1), ("g", 1), ("h", 2) ]: substs_with_letter = sum(1 for k in sr_keys if k.startswith(letter)) assert substs_with_letter == how_many def test_type_inference_no_artificial_doubles(): prog = lp.make_kernel( "{[i]: 0<=i bb = a[i] - b[i] c[i] = bb """, [ lp.GlobalArg("a", np.float32, shape=("n",)), lp.GlobalArg("b", np.float32, shape=("n",)), lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], assumptions="n>=1", target=lp.PyOpenCLTarget()) code = lp.generate_code_v2(prog).device_code() assert "double" not in code def test_type_inference_with_type_dependencies(): prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 a = a + 1 <>b = 0 <>c = 1 b = b + c + 1.0 c = b + c <>d = b + 2 + 1j """, "...") prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type assert prog["loopy_kernel"].temporary_variables["a"].dtype == to_loopy_type( np.int32) assert prog["loopy_kernel"].temporary_variables["b"].dtype == to_loopy_type( np.float32) assert prog["loopy_kernel"].temporary_variables["c"].dtype == to_loopy_type( np.float32) assert prog["loopy_kernel"].temporary_variables["d"].dtype == to_loopy_type( np.complex128) def test_sized_and_complex_literals(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0<=i aa = 5jf <> bb = 5j a[i] = imag(aa) b[i] = imag(bb) c[i] = 5f """, [ lp.GlobalArg("a", np.float32, shape=("n",)), lp.GlobalArg("b", np.float32, shape=("n",)), lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], assumptions="n>=1") lp.auto_test_vs_ref(knl, ctx, knl, parameters={"n": 5}) def test_simple_side_effect(): knl = lp.make_kernel( "{[i]: 0<=i<100}", """ a[i] = a[i] + 1 """, [lp.GlobalArg("a", np.float32, shape=(100,))], target=lp.PyOpenCLTarget() ) print(knl) print(lp.generate_code_v2(knl)) def test_owed_barriers(): knl = lp.make_kernel( "{[i]: 0<=i<100}", [ " z[i] = a[i]" ], [lp.GlobalArg("a", np.float32, shape=(100,))], target=lp.PyOpenCLTarget() ) knl = lp.tag_inames(knl, {"i": "l.0"}) print(knl) print(lp.generate_code_v2(knl)) def test_multi_cse(): knl = lp.make_kernel( "{[i]: 0<=i<100}", [ " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], target=lp.PyOpenCLTarget()) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) print(knl) print(lp.generate_code_v2(knl)) def test_bare_data_dependency(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( [ "[znirp] -> {[i]: 0<=i znirp = n", "a[i] = 1", ], [ lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), ]) n = 20000 _evt, (a,) = knl(queue, n=n, out_host=True) assert a.shape == (n,) assert (a == 1).all() # {{{ test race detection def test_ilp_write_race_detection_global(): knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j a[i] = 5+i+j", ], [], target=lp.PyOpenCLTarget(), name="loopy_kernel") knl = lp.tag_inames(knl, {"i": "l.0", "j": "ilp"}) knl = lp.preprocess_kernel(knl) assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17) def test_ilp_write_race_avoidance_private(): knl = lp.make_kernel( "{[j]: 0<=j<16 }", [ "<> a = 5+j", ], [], target=lp.PyOpenCLTarget(), name="loopy_kernel") knl = lp.tag_inames(knl, {"j": "ilp"}) knl = lp.preprocess_kernel(knl) assert knl["loopy_kernel"].temporary_variables["a"].shape == (16,) # }}} def test_write_parameter(dtype=np.float32): knl = lp.make_kernel( "{[i,j]: 0<=i,j gid = i/256 start = gid*256 for j a[start + j] = a[start + j] + j end end """, seq_dependencies=True, name="uniform_l", target=lp.PyOpenCLTarget(), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) print(lp.generate_code_v2(knl).device_code()) # }}} def test_nonlinear_index(): knl = lp.make_kernel( "{[i]: 0<=i src_ibox = source_boxes[isrc_box] <> isrc_start = box_source_starts[src_ibox] <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] <> strength = strengths[isrc] {id=set_strength} """, [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild", None, shape=None), lp.GlobalArg("strengths", None, shape="nsources"), "..."], target=lp.PyOpenCLTarget(), name="loopy_kernel") print(prog) assert "isrc_box" in prog["loopy_kernel"].insn_inames("set_strength") prog = lp.add_dtypes(prog, { "source_boxes": np.int32, "box_source_starts": np.int32, "box_source_counts_nonchild": np.int32, "strengths": np.float64, "nsources": np.int32}) print(lp.generate_code_v2(prog).device_code()) def test_inames_deps_from_write_subscript(ctx_factory): prog = lp.make_kernel( "{[i,j]: 0<=i,j src_ibox = source_boxes[i] something = 5 a[src_ibox] = sum(j, something) {id=myred} """, [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), "..."], name="loopy_kernel") print(prog) assert "i" in prog["loopy_kernel"].insn_inames("myred") def test_modulo_indexing(): knl = lp.make_kernel( "{[i,j]: 0<=i my_a = a[i,j] {id=read_a} <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j} my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero} my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero} out[i,j] = 2*my_a {dep=aplus} """, [ lp.GlobalArg("a", np.float32, shape=lp.auto), lp.GlobalArg("out", np.float32, shape=lp.auto), "..." ]) ref_knl = knl lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={ "n": 200 }) def test_conditional_two_ways(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{ [i,j]: 0<=i,j b = i > 3 <> c = i > 1 out[i] = a[i] {id=init} if b out[i] = 2*a[i] {if=c,dep=init} end """, [ lp.GlobalArg("a", np.float32, shape=lp.auto), lp.GlobalArg("out", np.float32, shape=lp.auto), "..." ] ) ref_knl = lp.make_kernel( "{ [i,j]: 0<=i,j b = i > 3 <> c = i > 1 out[i] = a[i] {id=init} if b and c out[i] = 2*a[i] {dep=init} end """, [ lp.GlobalArg("a", np.float32, shape=lp.auto), lp.GlobalArg("out", np.float32, shape=lp.auto), "..." ] ) ref_knl = knl lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={ "n": 200 }) def test_ilp_loop_bound(ctx_factory): # The salient bit of this test is that a joint bound on (outer, inner) # from a split occurs in a setting where the inner loop has been ilp'ed. # In 'normal' parallel loops, the inner index is available for conditionals # throughout. In ILP'd loops, not so much. ctx = ctx_factory() knl = lp.make_kernel( "{ [i,j,k]: 0<=i,j,k temp[i, 0] = 17 temp[i, 1] = 15 """) knl = lp.tag_inames(knl, {"i": "l.0"}) print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) intermediate_format = "f,f,sep" rng = np.random.default_rng(seed=42) a1 = rng.normal(size=(1024, 4, 3)) cknl1 = lp.make_copy_kernel(intermediate_format) cknl1 = lp.fix_parameters(cknl1, n2=3) cknl1 = lp.set_options(cknl1, write_code=True) _evt, a2 = cknl1(queue, input=a1) cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format) cknl2 = lp.fix_parameters(cknl2, n2=3) _evt, a3 = cknl2(queue, input=a2) assert (a1 == a3).all() def test_make_copy_kernel_with_offsets(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) rng = np.random.default_rng(seed=42) a1 = rng.normal(size=(3, 1024, 4)) a1_dev = cl.array.to_device(queue, a1) cknl1 = lp.make_copy_kernel("c,c,c", "sep,c,c") cknl1 = lp.fix_parameters(cknl1, n0=3) cknl1 = lp.set_options(cknl1, write_code=True) _evt, (a2_dev,) = cknl1(queue, input=a1_dev) assert (a1 == a2_dev.get()).all() def test_auto_test_can_detect_problems(ctx_factory): ctx = ctx_factory() ref_knl = lp.make_kernel( "{[i,j]: 0<=i,jt = i ... gbarrier out[i] = t end """, seq_dependencies=True) if hw_loop: prog = lp.tag_inames(prog, {"i": "g.0"}) save_and_reload_temporaries_test(queue, prog, np.arange(8), debug) def test_save_of_private_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i]: 0<=i<8 }", """ for i <>t[i] = i ... gbarrier out[i] = t[i] end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t", "private") save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j,k]: 0<=i,j,k<8 }", """ for i for j <>t[j] = j end ... gbarrier for k out[i,k] = t[k] end end """, seq_dependencies=True) knl = lp.tag_inames(knl, {"i": "g.0"}) knl = lp.set_temporary_address_space(knl, "t", "private") save_and_reload_temporaries_test( queue, knl, np.vstack(8 * (np.arange(8),)), debug) def test_save_of_private_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", """ for i for j, k <>t[j,k] = k end ... gbarrier for l, m out[i,l,m] = t[l,m] end end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t", "private") result = np.array([np.vstack(8 * (np.arange(8),)) for i in range(8)]) save_and_reload_temporaries_test(queue, knl, result, debug) def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", """ for i for j, k <>t[j,k] = k end ... gbarrier for l, m out[i,l,m] = t[l,m] end end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t", "private") knl = lp.tag_inames(knl, {"i": "g.0"}) result = np.array([np.vstack(8 * (np.arange(8),)) for i in range(8)]) save_and_reload_temporaries_test(queue, knl, result, debug) @pytest.mark.parametrize("hw_loop", [True, False]) def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j,k]: 0<=i,j,k<10 }", """ for i for k <> t_arr[k] = k end <> t_scalar = 1 for j ... gbarrier out[j] = t_scalar ... gbarrier t_scalar = 10 end ... gbarrier <> flag = i == 9 out[i] = t_arr[i] {if=flag} end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t_arr", "private") if hw_loop: knl = lp.tag_inames(knl, {"i": "g.0"}) result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) save_and_reload_temporaries_test(queue, knl, result, debug) def test_save_of_local_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j]: 0<=i,j<8 }", """ for i, j <>t[2*j] = j t[2*j+1] = j ... gbarrier out[i] = t[2*i] end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t", "local") knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"}) save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_of_local_array_with_explicit_local_barrier(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j]: 0<=i,j<8 }", """ for i, j <>t[2*j] = j ... lbarrier t[2*j+1] = t[2*j] ... gbarrier out[i] = t[2*i] end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t", "local") knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"}) save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_local_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}", """ for i, j, k ... gbarrier <> t_local[k,j] = 1 ... gbarrier out[k,i*2+j] = t_local[k,j] end """, seq_dependencies=True) knl = lp.set_temporary_address_space(knl, "t_local", "local") knl = lp.tag_inames(knl, {"j": "l.0", "i": "g.0"}) save_and_reload_temporaries_test(queue, knl, 1, debug) def test_save_with_base_storage(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a[i] = 0 <>b[i] = i ... gbarrier out[i] = a[i] """, "...", seq_dependencies=True) knl = lp.tag_inames(knl, {"i": "l.0"}) knl = lp.set_temporary_address_space(knl, "a", "local") knl = lp.set_temporary_address_space(knl, "b", "local") knl = lp.alias_temporaries(knl, ["a", "b"], synchronize_for_exclusive_use=False) knl = lp.preprocess_kernel(knl) knl = lp.allocate_temporaries_for_base_storage(knl) save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) def test_save_ambiguous_storage_requirements(): knl = lp.make_kernel( "{[i,j]: 0 <= i < 10 and 0 <= j < 10}", """ <>a[j] = j ... gbarrier out[i,j] = a[j] """, seq_dependencies=True) knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"}) knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"}) knl = lp.set_temporary_address_space(knl, "a", "local") from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): lp.save_and_reload_temporaries(knl) def test_save_across_inames_with_same_tag(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a[i] = i ... gbarrier out[i] = a[i] """, "...", seq_dependencies=True) knl = lp.tag_inames(knl, {"i": "l.0"}) knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"}) save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) def test_missing_temporary_definition_detection(): knl = lp.make_kernel( "{ [i]: 0<=i<10 }", """ for i <> t = 1 ... gbarrier out[i] = t end """, seq_dependencies=True) from loopy.diagnostic import MissingDefinitionError with pytest.raises(MissingDefinitionError): lp.generate_code_v2(knl) def test_missing_definition_check_respects_aliases(): # Based on https://github.com/inducer/loopy/issues/69 knl = lp.make_kernel("{ [i] : 0<=i c[i] = a[i + 1] ... gbarrier out[i] = c[i] end """, seq_dependencies=True) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) knl = lp.set_temporary_address_space(knl, "c", "global") ref_knl = knl knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") cgr = lp.generate_code_v2(knl) assert len(cgr.device_programs) == 2 print(cgr.device_code()) # print(cgr.host_code()) lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5}) def test_assign_to_linear_subscript(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl1 = lp.make_kernel( "{ [i]: 0<=i aa = 5jf <> bb = 5j a[i] = imag(aa) b[i] = imag(bb) c[i] = 5f end """, seq_dependencies=True) print(prog["loopy_kernel"].stringify(with_dependencies=True)) lp.auto_test_vs_ref(prog, ctx, prog, parameters={"n": 5}) def test_nop(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[i,itrip]: 0<=i z[i] = z[i+1] + z[i] {id=wr_z} <> v[i] = 11 {id=wr_v} ... nop {dep=wr_z:wr_v,id=yoink} z[i] = z[i] - z[i+1] + v[i] {dep=yoink} end """) print(knl) knl = lp.fix_parameters(knl, n=15) knl = lp.add_and_infer_dtypes(knl, {"z": np.float64}) lp.auto_test_vs_ref(knl, ctx, knl, parameters={"ntrips": 5}) def test_global_barrier(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[i,itrip]: 0<=i z[i] = z[i+1] + z[i] {id=wr_z,dep=top} <> v[i] = 11 {id=wr_v,dep=top} ... gbarrier {id=yoink,dep=wr_z:wr_v} z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} end ... gbarrier {dep=iupd,id=postloop} z[i] = z[i] - z[i+1] + v[i] {dep=postloop} end """) knl = lp.fix_parameters(knl, ntrips=3) knl = lp.add_and_infer_dtypes(knl, {"z": np.float64}) ref_knl = knl ref_knl = lp.set_temporary_address_space(ref_knl, "z", "global") ref_knl = lp.set_temporary_address_space(ref_knl, "v", "global") knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") print(knl) knl = lp.preprocess_kernel(knl) assert ( knl["loopy_kernel"].temporary_variables["z"].address_space == lp.AddressSpace.GLOBAL) assert ( knl["loopy_kernel"].temporary_variables["v"].address_space == lp.AddressSpace.GLOBAL) print(knl) lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"ntrips": 5, "n": 10}) def test_missing_global_barrier(): knl = lp.make_kernel( "{[i,itrip]: 0<=i z[i] = z[i] - z[i+1] {id=iupd,dep=yoink} end # This is where the barrier should be z[i] = z[i] - z[i+1] + v[i] {dep=iupd} end """) knl = lp.set_temporary_address_space(knl, "z", "global") knl = lp.split_iname(knl, "i", 256, outer_tag="g.0") knl = lp.add_dtypes(knl, {"z": np.float32, "v": np.float32}) knl = lp.preprocess_kernel(knl) from loopy.diagnostic import MissingBarrierError with pytest.raises(MissingBarrierError): lp.generate_code_v2(knl) def test_index_cse(ctx_factory): knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m Tcond = T[k] < 0.5 if Tcond cp[k] = 2 * T[k] + Tcond end end """) knl = lp.fix_parameters(knl, n=200) knl = lp.add_and_infer_dtypes(knl, {"T": np.float32}) ref_knl = knl knl = lp.split_iname(knl, "k", 2, inner_tag="ilp") lp.auto_test_vs_ref(ref_knl, ctx, knl) @pytest.mark.parametrize("unr_tag", ["unr", "unr_hint"]) def test_unr_and_conditionals(ctx_factory, unr_tag): ctx = ctx_factory() knl = lp.make_kernel("{[k]: 0<=k Tcond[k] = T[k] < 0.5 if Tcond[k] cp[k] = 2 * T[k] + Tcond[k] end end """) knl = lp.fix_parameters(knl, n=200) knl = lp.add_and_infer_dtypes(knl, {"T": np.float32}) ref_knl = knl knl = lp.split_iname(knl, "k", 2, inner_tag=unr_tag) lp.auto_test_vs_ref(ref_knl, ctx, knl) def test_constant_array_args(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel("{[k]: 0<=k Tcond[k] = T[k] < 0.5 if Tcond[k] cp[k] = 2 * T[k] + Tcond[k] end end """, [lp.ConstantArg("T", shape=(200,), dtype=np.float32), "..."]) knl = lp.fix_parameters(knl, n=200) lp.auto_test_vs_ref(knl, ctx, knl) @pytest.mark.parametrize("src_order", ["C"]) @pytest.mark.parametrize("tmp_order", ["C", "F"]) def test_temp_initializer(ctx_factory, src_order, tmp_order): rng = np.random.default_rng(seed=42) a = rng.normal(size=(3, 3)).copy(order=src_order) ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i,j]: 0<=i,j {[]: }")], # empty (domain w/unused inames errors) "a = 1", [ lp.TemporaryVariable("a", dtype=np.float64, shape=(), base_storage="base"), lp.TemporaryVariable("b", dtype=np.float64, shape=(), base_storage="base"), ]) knl = lp.preprocess_kernel(knl) knl = lp.allocate_temporaries_for_base_storage(knl) knl(queue, out_host=True) def test_if_else(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{ [i]: 0<=i<50}", """ if i % 3 == 0 a[i] = 15 {nosync_query=writes:a} elif i % 3 == 1 a[i] = 11 {nosync_query=writes:a} else a[i] = 3 {nosync_query=writes:a} end """ ) _evt, (out,) = knl(queue, out_host=True) out_ref = np.empty(50) out_ref[::3] = 15 out_ref[1::3] = 11 out_ref[2::3] = 3 assert np.array_equal(out_ref, out) knl = lp.make_kernel( "{ [i]: 0<=i<50}", """ for i if i % 2 == 0 if i % 3 == 0 a[i] = 15 {nosync_query=writes:a} elif i % 3 == 1 a[i] = 11 {nosync_query=writes:a} else a[i] = 3 {nosync_query=writes:a} end else a[i] = 4 {nosync_query=writes:a} end end """ ) _evt, (out,) = knl(queue, out_host=True) out_ref = np.zeros(50) out_ref[1::2] = 4 out_ref[0::6] = 15 out_ref[4::6] = 11 out_ref[2::6] = 3 knl = lp.make_kernel( "{ [i,j]: 0<=i,j<50}", """ for i if i < 25 for j if j % 2 == 0 a[i, j] = 1 {nosync_query=writes:a} else a[i, j] = 0 {nosync_query=writes:a} end end else for j if j % 2 == 0 a[i, j] = 0 {nosync_query=writes:a} else a[i, j] = 1 {nosync_query=writes:a} end end end end """ ) _evt, (out,) = knl(queue, out_host=True) out_ref = np.zeros((50, 50)) out_ref[:25, 0::2] = 1 out_ref[25:, 1::2] = 1 assert np.array_equal(out_ref, out) def test_tight_loop_bounds(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) if (queue.device.platform.vendor == "Intel(R) Corporation" and queue.device.driver_version in [ "2019.8.7.0", "2019.8.8.0", ]): pytest.skip("Intel CL miscompiles this kernel") knl = lp.make_kernel( ["{ [i] : 0 <= i <= 5 }", "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"], """ for i for j out[j] = j end end """, silenced_warnings="write_race(insn)") knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") knl = lp.set_options(knl, write_code=True) _evt, (out,) = knl(queue, out_host=True) assert (out == np.arange(10)).all() def test_tight_loop_bounds_codegen(): knl = lp.make_kernel( ["{ [i] : 0 <= i <= 5 }", "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"], """ for i for j out[j] = j end end """, silenced_warnings="write_race(insn)", target=lp.OpenCLTarget()) knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") cgr = lp.generate_code_v2(knl) # print(cgr.device_code()) for_loop = \ "for (int j = " \ "((gid(0) == 0 && lid(0) == 0) ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \ "j <= ((-1 + gid(0) == 0 && lid(0) == 0) ? 9 : 2 * lid(0)); ++j)" assert for_loop in cgr.device_code() def test_unscheduled_insn_detection(): prog = lp.make_kernel( "{ [i]: 0 <= i < 10 }", """ out[i] = i {id=insn1} """, "...") prog = lp.preprocess_kernel(prog) prog = lp.linearize(prog) insn1, = lp.find_instructions(prog, "id:insn1") insns = prog["loopy_kernel"].instructions[:] insns.append(insn1.copy(id="insn2", depends_on=frozenset({"insn1"}))) prog = prog.with_kernel(prog["loopy_kernel"].copy(instructions=insns)) from loopy.diagnostic import UnscheduledInstructionError with pytest.raises(UnscheduledInstructionError): lp.generate_code(prog) def test_integer_reduction(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) rng = np.random.default_rng(seed=42) from loopy.types import to_loopy_type n = 200 for vtype in [np.int32, np.int64]: var_int = rng.integers(1000, size=n, dtype=vtype) var_lp = lp.TemporaryVariable("var", initializer=var_int, read_only=True, address_space=lp.AddressSpace.PRIVATE, dtype=to_loopy_type(vtype), shape=lp.auto) from collections import namedtuple ReductionTest = namedtuple("ReductionTest", "kind, check, args") reductions = [ ReductionTest("max", lambda x: x == np.max(var_int), args="var[k]"), ReductionTest("min", lambda x: x == np.min(var_int), args="var[k]"), ReductionTest("sum", lambda x: x == np.sum(var_int), args="var[k]"), ReductionTest("product", lambda x: x == np.prod(var_int), args="var[k]"), ReductionTest("argmax", lambda x: ( x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), args="var[k], k"), ReductionTest("argmin", lambda x: ( x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), args="var[k], k") ] for reduction, function, args in reductions: kstr = ("out" if "arg" not in reduction else "out[0], out[1]") kstr += f" = {reduction}(k, {args})" knl = lp.make_kernel("{[k]: 0<=k dist_sq = sum(idim, (tgt[idim,itgt] - center[idim,ictr])**2) <> in_disk = dist_sq < (radius[ictr]*1.05)**2 <> matches = ( (in_disk and qbx_forced_limit == 0) or (in_disk and qbx_forced_limit != 0 and qbx_forced_limit * center_side[ictr] > 0) ) <> post_dist_sq = dist_sq if matches else HUGE end <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq) tgt_to_qbx_center[itgt] = min_ictr if min_dist_sq < HUGE else -1 end """) knl = lp.fix_parameters(knl, ambient_dim=2) knl = lp.add_and_infer_dtypes(knl, { "tgt,center,radius,HUGE": np.float32, "center_side,qbx_forced_limit": np.int32, }) lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={ "HUGE": 1e20, "ncenters": 200, "ntargets": 300, "qbx_forced_limit": 1}) def test_nosync_option_parsing(): knl = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>t = 1 {id=insn1,nosync=insn1} t = 2 {id=insn2,nosync=insn1:insn2} t = 3 {id=insn3,nosync=insn1@local:insn2@global:insn3@any} t = 4 {id=insn4,nosync_query=id:insn*@local} t = 5 {id=insn5,nosync_query=id:insn1} """, options=lp.Options(allow_terminal_colors=False)) kernel_str = str(knl) print(kernel_str) assert "id=insn1, no_sync_with=insn1@any" in kernel_str assert "id=insn2, no_sync_with=insn1@any:insn2@any" in kernel_str assert "id=insn3, no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str assert "id=insn4, no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa: E501 assert "id=insn5, no_sync_with=insn1@any" in kernel_str def barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): from loopy.schedule import ( Barrier, CallKernel, EnterLoop, LeaveLoop, ReturnFromKernel, RunInstruction, ) watch_for_barrier = False seen_barrier = False loop_level = 0 for sched_item in knl.linearization: if isinstance(sched_item, RunInstruction): if sched_item.insn_id == id1: watch_for_barrier = True elif sched_item.insn_id == id2: return watch_for_barrier and seen_barrier elif isinstance(sched_item, Barrier): if watch_for_barrier and loop_level not in ignore_barriers_in_levels: seen_barrier = True elif isinstance(sched_item, EnterLoop): loop_level += 1 elif isinstance(sched_item, LeaveLoop): loop_level -= 1 elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): pass else: raise RuntimeError("schedule item type '%s' not understood" % type(sched_item).__name__) raise RuntimeError("id2 was not seen") def test_barrier_insertion_near_top_of_loop(): prog = lp.make_kernel( "{[i,j]: 0 <= i,j < 10 }", """ for i <>a[i] = i {id=ainit} for j <>t = a[(i + 1) % 10] {id=tcomp} <>b[i,j] = a[i] + t {id=bcomp1} b[i,j] = b[i,j] + 1 {id=bcomp2} end end """, seq_dependencies=True) prog = lp.tag_inames(prog, {"i": "l.0"}) prog = lp.set_temporary_address_space(prog, "a", "local") prog = lp.set_temporary_address_space(prog, "b", "local") prog = lp.preprocess_kernel(prog) knl = lp.get_one_linearized_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) assert barrier_between(knl, "ainit", "tcomp") def test_barrier_insertion_near_bottom_of_loop(): prog = lp.make_kernel( ["{[i]: 0 <= i < 10 }", "[jmax] -> {[j]: 0 <= j < jmax}"], """ for i <>a[i] = i {id=ainit} for j <>b[i,j] = a[(i+1) % 10] + t {id=bcomp1} b[i,j] = b[(i+2) % 10,j] + 1 {id=bcomp2} end a[10-i] = i + 1 {id=aupdate} end """, seq_dependencies=True) prog = lp.tag_inames(prog, {"i": "l.0"}) prog = lp.set_temporary_address_space(prog, "a", "local") prog = lp.set_temporary_address_space(prog, "b", "local") prog = lp.preprocess_kernel(prog) knl = lp.get_one_linearized_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) assert barrier_between(knl, "bcomp1", "bcomp2") assert barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ for i a[i] = i {id=a} ... lbarrier {id=barrier} b[i + 1] = a[i] {nosync=a} end """, [lp.TemporaryVariable("a", np.float32, shape=(10,), order="C", address_space=lp.AddressSpace.LOCAL), lp.GlobalArg("b", np.float32, shape=(11,), order="C")], seq_dependencies=True) # split into kernel w/ vesize larger than iname domain vecsize = 16 prog = lp.split_iname(prog, "i", vecsize, inner_tag="l.0") from testlib import GridOverride # artificially expand via overridden_get_grid_sizes_for_insn_ids knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) prog = prog.with_kernel(knl) # make sure we can generate the code lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): from loopy.library.reduction import SegmentedSumReductionOperation from loopy.type_inference import TypeReader from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() prog = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j z[i] = z[i+1] + z[i] {id=wr_z,dep=top} <> v[i] = 11 {id=wr_v,dep=top} ... gbarrier {dep=wr_z:wr_v,id=yoink} z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} end ... nop {id=nop} ... gbarrier {dep=iupd,id=postloop} z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop} end """) assert (lp.get_global_barrier_order(prog["loopy_kernel"]) == ("top", "yoink", "postloop")) for insn, barrier in ( ("nop", None), ("top", None), ("wr_z", "top"), ("wr_v", "top"), ("yoink", "top"), ("postloop", "yoink"), ("zzzv", "postloop")): assert lp.find_most_recent_global_barrier(prog["loopy_kernel"], insn) == barrier def test_global_barrier_error_if_unordered(): # FIXME: Should be illegal to declare this prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ ... gbarrier ... gbarrier """) from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): lp.get_global_barrier_order(prog["loopy_kernel"]) def test_struct_assignment(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) bbhit = np.dtype([ ("tmin", np.float32), ("tmax", np.float32), ("bi", np.int32), ("hit", np.int32)]) bbhit, bbhit_c_decl = cl.tools.match_dtype_to_c_struct( ctx.devices[0], "bbhit", bbhit) bbhit = cl.tools.get_or_register_dtype("bbhit", bbhit) preamble = bbhit_c_decl knl = lp.make_kernel( "{ [i]: 0<=itmp1 = 0 end for j ... gbarrier <>tmp2 = i end """, "...", seq_dependencies=True) knl = lp.tag_inames(knl, {"i": "g.0"}) with cl.CommandQueue(ctx) as queue: knl(queue) def test_fixed_parameters(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "[n] -> {[i]: 0 <= i < n}", """ <>tmp[i] = i {id=init} tmp[0] = 0 {dep=init} """, fixed_parameters={"n": 1}) knl(queue) def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") assert knl["loopy_kernel"].all_params() == {"n"} def test_execution_backend_can_cache_dtypes(ctx_factory): # When the kernel is invoked, the execution backend uses it as a cache key # for the type inference and scheduling cache. This tests to make sure that # dtypes in the kernel can be cached, even though they may not have a # target. ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i") knl = lp.add_dtypes(knl, {"tmp": int}) knl(queue) def test_wildcard_dep_matching(): prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} <>b = 0 {id=insn2,dep=insn?} <>c = 0 {id=insn3,dep=insn*} <>d = 0 {id=insn4,dep=insn[12]} <>e = 0 {id=insn5,dep=insn[!1]} """, "...") all_insns = {"insn%d" % i for i in range(1, 6)} assert prog["loopy_kernel"].id_to_insn["insn1"].depends_on == set() assert (prog["loopy_kernel"].id_to_insn["insn2"].depends_on == all_insns - {"insn2"}) assert (prog["loopy_kernel"].id_to_insn["insn3"].depends_on == all_insns - {"insn3"}) assert (prog["loopy_kernel"].id_to_insn["insn4"].depends_on == {"insn1", "insn2"}) assert (prog["loopy_kernel"].id_to_insn["insn5"].depends_on == all_insns - {"insn1", "insn5"}) def test_arg_inference_for_predicates(): prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ if incr[i] a = a + 1 end """, name="loopy_kernel") knl = prog["loopy_kernel"] assert "incr" in knl.arg_dict assert knl.arg_dict["incr"].shape == (10,) def test_relaxed_stride_checks(ctx_factory): # Check that loopy is compatible with numpy's relaxed stride rules. ctx = ctx_factory() knl = lp.make_kernel("{[i,j]: 0 <= i <= n and 0 <= j <= m}", """ a[i] = sum(j, A[i,j] * b[j]) """) with cl.CommandQueue(ctx) as queue: mat = np.zeros((1, 10), order="F") b = np.zeros(10) _evt, (a,) = knl(queue, A=mat, b=b) assert a == 0 def test_add_prefetch_works_in_lhs_index(): prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l] a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1 a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2] end """, [ lp.GlobalArg("a1,a1_out", None, "ndofs,2"), lp.GlobalArg("a1_map", None, "nelements,3"), "..." ]) prog = lp.add_prefetch(prog, "a1_map", "k", default_tag="l.auto") from loopy.symbolic import get_dependencies for insn in prog["loopy_kernel"].instructions: assert "a1_map" not in get_dependencies(insn.assignees) def test_check_for_variable_access_ordering(): knl = lp.make_kernel( "{[i]: 0<=i nu = i - 4 if nu > 0 <> P_val = a[i, j] {id=pset0} else P_val = 0.1 * a[i, j] {id=pset1} end <> B_sum = 0 for k B_sum = B_sum + k * P_val {id=bset, dep=pset*} end # here, we are testing that Kc is properly promoted to a vector dtype <> Kc = P_val * B_sum {id=kset, dep=bset} a[i, j] = Kc {dep=kset} end end """, [lp.GlobalArg("a", shape=(12, 12), dtype=np.int32)]) knl = lp.split_iname(knl, "j", 4, inner_tag="vec") knl = lp.split_array_axis(knl, "a", 1, 4) knl = lp.tag_array_axes(knl, "a", "N1,N0,vec") knl = lp.preprocess_kernel(knl) from loopy.diagnostic import DependencyCycleFound with pytest.raises(DependencyCycleFound): print(lp.generate_code_v2(knl).device_code()) def test_backwards_dep_printing_and_error(): knl = lp.make_kernel( "{[i]: 0<=i icontaining_tgt_box = 1 {id=flagset} if icontaining_tgt_box == 1 result = result + simul_reduce(sum, i, i*i) result = result + simul_reduce(sum, i, 2*i*i) end """, name="lpy_knl") ppknl = lp.preprocess_kernel(knl) # accumulator initializers must be dependency-less assert all(not insn.depends_on for insn in ppknl["lpy_knl"].instructions if "init" in insn.id) # accumulator initializers must not have inherited the predicates assert all(not insn.predicates for insn in ppknl["lpy_knl"].instructions if "init" in insn.id) # Ensure valid linearization exists: No valid linearization unless the # accumulator initializers can move out of the loop. print(lp.generate_code_v2(ppknl).device_code()) def test_scalar_temporary(ctx_factory): from numpy.random import default_rng ctx = ctx_factory() queue = cl.CommandQueue(ctx) rng = default_rng() x_in = rng.random() knl = lp.make_kernel( "{:}", """ tmp = 2*x y = 2*tmp """, [lp.ValueArg("x", dtype=float), lp.TemporaryVariable("tmp", address_space=lp.AddressSpace.GLOBAL, shape=lp.auto), ...]) _evt, (out, ) = knl(queue, x=x_in) np.testing.assert_allclose(4*x_in, out.get()) def test_cached_written_variables_doesnt_carry_over_invalidly(): knl = lp.make_kernel( "{:}", """ a[i] = 2*i {id=write_a} b[i] = 2*i {id=write_b} """) from pickle import dumps, loads knl2 = loads(dumps(knl)) knl2 = lp.remove_instructions(knl2, {"write_b"}) assert "b" not in knl2["loopy_kernel"].get_written_variables() def test_kernel_tagging(): from pytools.tag import Tag class LessInformativeTag(Tag): pass class SuperInformativeTag(Tag): pass class SuperDuperInformativeTag(SuperInformativeTag): pass t1 = SuperInformativeTag() t2 = LessInformativeTag() knl1 = lp.make_kernel( "{:}", "y = 0", tags=frozenset((t1, t2))) knl1 = knl1.default_entrypoint assert knl1.tags == frozenset((t1, t2)) t3 = SuperDuperInformativeTag() knl2 = knl1.tagged(tags=frozenset((t3,))) assert knl2.tags == frozenset((t1, t2, t3)) knl3 = knl2.without_tags(tags=frozenset((t2,))) assert knl3.tags == frozenset((t1, t3)) assert knl3.copy().tags == knl3.tags def test_split_iname_with_multiple_dim_params(ctx_factory): ctx = ctx_factory() ref_knl = lp.make_kernel( ["{[i, j]: 0<=i,j<16}", "[i,j] -> {[k]: 0<=k<=4}"], """ foo[i, j, k] = i+j+k """) knl = lp.split_iname(ref_knl, "i", 4) lp.auto_test_vs_ref(ref_knl, ctx, knl) @pytest.mark.parametrize("opt_name", ["trace_assignments", "trace_assignment_values"]) def test_trace_assignments(ctx_factory, opt_name): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i,j]: 0<=i,j<2}", """ foo[i,j] = i+j """) knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"}) knl = lp.set_options(knl, **{opt_name: True}) knl(queue) def test_tunit_to_python(): knl = lp.make_kernel( "{[i, j]: 0<=i,j tmp[i] = i ... gbarrier out[j] = tmp[9-j] """, seq_dependencies=True) t_unit = lp.linearize(lp.preprocess_kernel(t_unit)) ret_from_knl_idx = get_return_from_kernel_mapping(t_unit.default_entrypoint) assert ret_from_knl_idx[0] == 4 assert ret_from_knl_idx[1] == 4 assert ret_from_knl_idx[2] == 4 assert ret_from_knl_idx[3] == 4 assert ret_from_knl_idx[6] == 10 assert ret_from_knl_idx[7] == 10 assert ret_from_knl_idx[8] == 10 assert ret_from_knl_idx[9] == 10 def test_zero_stride_array(ctx_factory): ctx = ctx_factory() cq = cl.CommandQueue(ctx) knl = lp.make_kernel( ["{[i]: 0<=i<10}", "{[j]: 1=0}"], """ y[i, j] = 1 """, [lp.GlobalArg("y", shape=(10, 0))]) _evt, (out,) = knl(cq) assert out.shape == (10, 0) def test_sep_array_ordering(ctx_factory): # https://github.com/inducer/loopy/pull/667 ctx = ctx_factory() cq = cl.CommandQueue(ctx) # NOTE: this works with n = 10, but fails with n >= 11 n = 11 knl = lp.make_kernel( "{[i, k]: 0<=k tmp[k] = k ** 2 y[j] = 0 if j < 5 else sum(i, tmp[i+j-5]) """, seq_dependencies=True) # if predicates are added correctly, access checker does not raise lp.auto_test_vs_ref(knl, ctx, knl) def test_redn_in_predicate(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( ["{[i]: 0<= i < 5}", "{[j]: 0<= j < 10}", "{[k]: 0<=k<10}"], """ y[j] = sum(i, i**3) if (sum(k, k**2) < 2) else (10 - j) """, seq_dependencies=True) lp.auto_test_vs_ref(knl, ctx, knl) def test_obj_tagged_is_persistent_hashable(): from pytools.tag import Tag, tag_dataclass from loopy.match import ObjTagged from loopy.tools import LoopyKeyBuilder lkb = LoopyKeyBuilder() @tag_dataclass class MyTag(Tag): pass assert lkb(ObjTagged(MyTag())) == lkb(ObjTagged(MyTag())) @pytest.mark.xfail def test_vec_loops_surrounded_by_preds(ctx_factory): # See https://github.com/inducer/loopy/issues/615 ctx = ctx_factory() knl = lp.make_kernel( "{[i, j]: 0<=i<100 and 0<=j<4}", """ for i for j if j <> tmp[j] = 1 end out[i, j] = 2*tmp[j] end end """, seq_dependencies=True) ref_knl = knl knl = lp.tag_array_axes(knl, "tmp", "vec") knl = lp.tag_inames(knl, "j:vec") lp.auto_test_vs_ref(ref_knl, ctx, knl) def test_vec_inames_can_reenter(ctx_factory): # See https://github.com/inducer/loopy/issues/644 ctx = ctx_factory() cq = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i, j]: 0<=i,j<4}", """ for i <> tmp0[i] = 1 for j <> tmp1[i] = 2 end <> tmp2[i] = 3 out[i] = tmp0[i] + tmp1[i] + tmp2[i] end """, seq_dependencies=True) knl = lp.tag_inames(knl, "i:vec") knl = lp.tag_array_axes(knl, "tmp0,tmp1,tmp2", "vec") knl = lp.duplicate_inames(knl, "i", within="writes:tmp1", tags={"i": "vec"}) _, (out,) = knl(cq) np.testing.assert_allclose(out.get(), 6*np.ones(4)) def test_split_and_join_inames(ctx_factory): # See https://github.com/inducer/loopy/issues/652 ctx = ctx_factory() tunit = lp.make_kernel( "{[i]: 0<=i<16}", """ y[i] = i """) ref_tunit = tunit tunit = lp.split_iname(tunit, "i", 4) tunit = lp.join_inames(tunit, ["i_inner", "i_outer"]) lp.auto_test_vs_ref(ref_tunit, ctx, tunit) def test_different_index_dtypes(): from loopy.diagnostic import LoopyError doublify = lp.make_function( "{[i]: 0<=i<10}", """ x[i] = x[i] * 2 """, name="doublify", index_dtype=np.int64 ) knl = lp.make_kernel( "{[I]: 0<=I<10}", """ [I]: X[I] = doublify([I]: X[I]) """, index_dtype=np.int32 ) knl = lp.merge([knl, doublify]) with pytest.raises(LoopyError): lp.generate_code_v2(knl) def test_translation_unit_pickle(): tunit = lp.make_kernel( "{[i]: 0<=i<16}", """ y[i] = i """) assert isinstance(hash(tunit), int) from pickle import dumps, loads tunit = loads(dumps(tunit)) assert isinstance(hash(tunit), int) def test_creation_kwargs(): # https://github.com/inducer/loopy/issues/705 knl = lp.make_kernel( "{[i]: 0<=i<10}", "a[i] = foo() * i", substitutions={"foo": lp.SubstitutionRule("foo", (), 3.14)}, ) assert len(knl.default_entrypoint.substitutions) != 0 # https://github.com/inducer/loopy/issues/705 with pytest.raises(lp.LoopyError): lp.make_kernel( "{[i]: 0<=i<10}", """ foo := 5 a[i] = foo() * i """, substitutions={"foo": lp.SubstitutionRule("foo", (), 3.14)}, ) with pytest.raises(TypeError): knl = lp.make_kernel( "{[i]: 0<=i<10}", "a[i] = foo() * i", # not a known kwarg ksdfjlasdf=None) def test_global_temps_with_multiple_base_storages(ctx_factory): # See https://github.com/inducer/loopy/issues/737 n = 10 ctx = ctx_factory() cq = cl.CommandQueue(ctx) prg = lp.make_kernel( "{[r0, r1]: 0<=r0,r1< %s}" % n, """ tmp0 = sum(r0, r0**2) ... gbarrier tmp1 = sum(r1, r1**3) ... gbarrier out = tmp0 + tmp1 """, [lp.TemporaryVariable("tmp0", shape=lp.auto, address_space=lp.AddressSpace.GLOBAL, base_storage="base1"), lp.TemporaryVariable("tmp1", shape=lp.auto, address_space=lp.AddressSpace.GLOBAL, base_storage="base2"), ...], seq_dependencies=True ) prg = lp.infer_unknown_types(prg) prg = lp.allocate_temporaries_for_base_storage(prg) print(prg) _, (out,) = prg(cq) assert out == sum(i**2 for i in range(n)) + sum(i**3 for i in range(n)) def test_t_unit_to_python_with_substs(): t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ subst_0(i) := abs(10.0 * (i-5)) subst_1(i) := abs(10.0 * (i**2-5)) y[i] = subst_0(i) + subst_1(i) """) lp.t_unit_to_python(t_unit) # contains check to assert roundtrip equivalence def test_type_inference_of_clbls_in_substitutions(ctx_factory): # Regression for https://github.com/inducer/loopy/issues/746 ctx = ctx_factory() cq = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i]: 0<=i<10}", """ subst_0(_0) := abs(10.0 * (_0-5)) y[i] = subst_0(i) """) _evt, (out,) = knl(cq) np.testing.assert_allclose(out.get(), np.abs(10.0*(np.arange(10)-5))) def test_einsum_parsing(ctx_factory): ctx = ctx_factory() # See knl = lp.make_einsum("ik, kj -> ij", ["A", "B"]) knl = lp.add_dtypes(knl, {"A": np.float32, "B": np.float32}) lp.auto_test_vs_ref(knl, ctx, knl, parameters={"Ni": 10, "Nj": 10, "Nk": 10}) def test_no_barrier_err_for_global_temps_with_base_storage(ctx_factory): # Regression for https://github.com/inducer/loopy/issues/748 ctx = ctx_factory() cq = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i,j]: 0<=i, j<16}", """ for i tmp1[i] = i tmp2[i] = tmp1[i] + 2 end ... gbarrier for j out[j] = tmp1[j] + tmp2[j] end """, [lp.TemporaryVariable("tmp1", address_space=lp.AddressSpace.GLOBAL, base_storage="base1", shape=lp.auto), lp.TemporaryVariable("tmp2", address_space=lp.AddressSpace.GLOBAL, base_storage="base2", shape=lp.auto), ...], seq_dependencies=True ) knl = lp.split_iname(knl, "i", 4, inner_tag="l.0", outer_tag="g.0") knl = lp.split_iname(knl, "j", 4, inner_tag="l.0", outer_tag="g.0") knl = lp.preprocess_kernel(knl) knl = lp.allocate_temporaries_for_base_storage(knl) _, (out,) = knl(cq, out_host=True) np.testing.assert_allclose(2*np.arange(16) + 2, out) def test_dgemm_with_rectangular_tile_prefetch(): # See t_unit = lp.make_kernel( "{[i,j,k]: 0<=i,j<72 and 0<=k<32}", """ C[i,j] = sum(k, A[i,k] * B[k,j]) """, [lp.GlobalArg("A,B", dtype=np.float64, shape=lp.auto), ...], ) ref_t_unit = t_unit tx = 8 ty = 23 tk = 11 t_unit = lp.split_iname(t_unit, "i", tx, inner_tag="l.0", outer_tag="g.0") t_unit = lp.split_iname(t_unit, "j", ty, inner_tag="l.1", outer_tag="g.1") t_unit = lp.split_iname(t_unit, "k", tk) t_unit = lp.add_prefetch( t_unit, "A", sweep_inames=["i_inner", "k_inner"], temporary_address_space=lp.AddressSpace.LOCAL, fetch_outer_inames=frozenset({"i_outer", "j_outer", "k_outer"}), dim_arg_names=["iprftch_A", "kprftch_A"], default_tag=None, ) t_unit = lp.add_prefetch( t_unit, "B", sweep_inames=["k_inner", "j_inner"], temporary_address_space=lp.AddressSpace.LOCAL, fetch_outer_inames=frozenset({"i_outer", "j_outer", "k_outer"}), dim_arg_names=["kprftch_B", "jprftch_B"], default_tag=None, ) t_unit = lp.split_iname(t_unit, "kprftch_A", tx, inner_tag="l.0") t_unit = lp.split_iname(t_unit, "iprftch_A", ty, inner_tag="l.1") t_unit = lp.split_iname(t_unit, "jprftch_B", tx, inner_tag="l.0") t_unit = lp.split_iname(t_unit, "kprftch_B", ty, inner_tag="l.1") ctx = cl.create_some_context() lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) def test_modulo_vs_type_context(ctx_factory): t_unit = lp.make_kernel( "{[i]: 0 <= i < 10}", """ # previously, the float 'type context' would propagate into # the remainder, leading to 'i % 10.0' being generated, which # C/OpenCL did not like. a = i % 10 """) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) t_unit(queue) def test_barrier_non_zero_hw_lbound(): t_unit = lp.make_kernel( ["{[i]: 1<=i<17}", "{[j]: 0<=j<16}"], """ <> a[i] = i {id=w_a} <> b[j] = 2*a[j] {id=w_b} """) t_unit = lp.tag_inames(t_unit, {"i": "l.0", "j": "l.0"}) t_unit = lp.preprocess_kernel(t_unit) knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) assert barrier_between(knl, "w_a", "w_b") def test_no_unnecessary_lbarrier(ctx_factory): # This regression would fail on loopy.git <= 268a7f4 # (Issue reported by @thilinarmtb) t_unit = lp.make_kernel( "{[i_outer, i_inner]: 0 <= i_outer < n and 0 <= i_inner < 16}", """ <> s_a[i_inner] = ai[i_outer * 16 + i_inner] {id=write_s_a} ao[i_outer * 16 + i_inner] = 2.0 * s_a[i_inner] {id=write_ao, dep=write_s_a} """, assumptions="n>=0") t_unit = lp.add_dtypes(t_unit, {"ai": np.float32}) t_unit = lp.tag_inames(t_unit, {"i_inner": "l.0", "i_outer": "g.0"}) t_unit = lp.set_temporary_address_space(t_unit, "s_a", "local") t_unit = lp.prioritize_loops(t_unit, "i_outer,i_inner") t_unit = lp.preprocess_kernel(t_unit) knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) assert not barrier_between(knl, "write_s_a", "write_ao") def test_long_kernel(): n = 500 insns = [ f"a{i}[j{i}] = j{i}" for i in range(n) ] domains = [ f"{{ [j{i}]: 0<=j{i}<10 }}" for i in range(n) ] t_unit = lp.make_kernel(domains, insns) t_unit = lp.preprocess_kernel(t_unit) lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) @pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning") def test_loop_imperfect_nest_priorities_in_v2_scheduler(): # Reported by Connor Ward. See . knl = lp.make_kernel( "{ [i,j,k]: 0 <= i,j,k < 5}", """ x[i, j] = i + j y[i, k] = i + k """, loop_priority=frozenset({("i", "j"), ("i", "k")}), ) lp.generate_code_v2(knl) if __name__ == "__main__": import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: from pytest import main main([__file__]) # vim: foldmethod=marker