__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import logging import numpy as np import pytest import pyopencl as cl import pyopencl.clmath import pyopencl.clrandom from pyopencl.tools import ( # noqa: F401 pytest_generate_tests_for_pyopencl as pytest_generate_tests, ) from pytools.tag import Tag import loopy as lp from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 logger = logging.getLogger(__name__) # {{{ ContainsFloorDiv class ContainsFloorDiv(lp.symbolic.CombineMapper): def combine(self, values): return any(values) def map_floor_div(self, expr): return True def map_variable(self, expr): return False def map_constant(self, expr): return False # }}} @pytest.mark.parametrize("fix_parameters", (True, False)) def test_chunk_iname(ctx_factory, fix_parameters): ctx = ctx_factory() knl = lp.make_kernel( "{ [i]: 0<=i out_tmp = 0 {id=out_init,inames=i} out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] \ {id=out_up2,dep=out_init,nosync=out_up1} out[i] = out_tmp {dep=out_up1:out_up2} """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "alpha": np.float32, "b1": np.float32, "b2": np.float32}) ref_knl = knl knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") knl = lp.collect_common_factors_on_increment(knl, "out_tmp") lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 13}) def test_to_batched(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) rng = np.random.default_rng(seed=42) knl = lp.make_kernel( """ { [i,j]: 0<=i,j temp = 2*b[i] a[i] = temp """) knl = lp.add_and_infer_dtypes(knl, {"b": np.float32}) knl = lp.set_array_axis_names(knl, "a,b", "i") knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, split_kwargs={"slabs": (0, 1)}) knl = lp.tag_array_axes(knl, "a,b", "c,vec") ref_knl = knl ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) knl = lp.tag_inames(knl, {"i_inner": "vec"}) knl = lp.preprocess_kernel(knl) _code, _inf = lp.generate_code(knl) lp.auto_test_vs_ref( ref_knl, ctx, knl, parameters={"n": 30}) def test_extract_subst(ctx_factory): prog = lp.make_kernel( "{[i]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} <>tmp3[2*i] = 0 {id=insn3} <>tmp4 = 1 + tmp3[2*i] {id=insn4} <>tmp5[i] = 0 {id=insn5,groups=g1} tmp5[i] = 1 {id=insn6,conflicts=g1} """, name="nosync") orig_prog = lp.set_temporary_address_space(orig_prog, "tmp3", "local") orig_prog = lp.set_temporary_address_space(orig_prog, "tmp5", "local") # No dependency present - don't add nosync prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) assert frozenset() == ( prog["nosync"].id_to_insn["insn2"].no_sync_with) # Dependency present prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") assert frozenset() == ( prog["nosync"].id_to_insn["insn3"].no_sync_with) assert frozenset([("insn3", "local")]) == ( prog["nosync"].id_to_insn["insn4"].no_sync_with) # Bidirectional prog = lp.add_nosync( orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) assert frozenset([("insn4", "local")]) == ( prog["nosync"].id_to_insn["insn3"].no_sync_with) assert frozenset([("insn3", "local")]) == ( prog["nosync"].id_to_insn["insn4"].no_sync_with) # Groups prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") assert frozenset([("insn5", "local")]) == ( prog["nosync"].id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): i1 = lp.Assignment("b", 1, id=None) i2 = lp.Assignment("b", 1, id=None) i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) prog = lp.make_kernel("{[i]: i = 1}", [], name="lpy_knl") new_root_kernel = prog["lpy_knl"].copy(instructions=[i1, i2, i3, i4]) prog = prog.with_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids prog = uniquify_instruction_ids(prog) insn_ids = {insn.id for insn in prog["lpy_knl"].instructions} assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} """, name="splitter") prog = lp.split_iname(prog, "i", 4, within="id:to_split") for insn in prog["splitter"].instructions: if insn.id == "to_split": assert insn.within_inames == frozenset({"i_outer", "i_inner"}) if insn.id == "not_to_split": assert insn.within_inames == frozenset({"i"}) def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x b(x) := x**2 c(x) := 7 * x f[i] = c(b(a(i))) """ ) t_unit = lp.expand_subst(ref_prg) assert not any( cknl.subkernel.substitutions for cknl in t_unit.callables_table.values()) lp.auto_test_vs_ref(ref_prg, ctx, t_unit) # {{{ test_map_domain_vs_split_iname def _ensure_dim_names_match_and_align(obj_map, tgt_map): # (This function is also defined in independent, unmerged branch # new-dependency-and-nest-constraint-semantics-development, and used in # child branches thereof. Once these branches are all merged, it may make # sense to move this function to a location for more general-purpose # machinery. In the other branches, this function's name excludes the # leading underscore.) from islpy import align_spaces, dim_type as dt # first make sure names match if not all( set(obj_map.get_var_names(dt)) == set(tgt_map.get_var_names(dt)) for dt in [dt.in_, dt.out, dt.param]): raise ValueError( "Cannot align spaces; names don't match:\n%s\n%s" % (obj_map, tgt_map)) return align_spaces(obj_map, tgt_map) def test_map_domain_vs_split_iname(ctx_factory): # {{{ Make kernel knl = lp.make_kernel( [ "[nx,nt] -> {[x, t]: 0 <= x < nx and 0 <= t < nt}", "[ni] -> {[i]: 0 <= i < ni}", ], """ a[x,t] = b[x,t] {id=stmta} c[x,t] = d[x,t] {id=stmtc} e[i] = f[i] """, lang_version=(2018, 2), ) knl = lp.add_and_infer_dtypes(knl, {"b,d,f": np.float32}) ref_knl = knl # }}} # {{{ Apply domain change mapping knl_map_dom = ref_knl # Create map_domain mapping: import islpy as isl transform_map = isl.BasicMap( "[nt] -> {[t] -> [t_outer, t_inner]: " "0 <= t_inner < 32 and " "32*t_outer + t_inner = t and " "0 <= 32*t_outer + t_inner < nt}") # Call map_domain to transform kernel knl_map_dom = lp.map_domain(knl_map_dom, transform_map) # Prioritize loops (prio should eventually be updated in map_domain?) loop_priority = "x, t_outer, t_inner" knl_map_dom = lp.prioritize_loops(knl_map_dom, loop_priority) # Get a linearization proc_knl_map_dom = lp.preprocess_kernel(knl_map_dom) lin_knl_map_dom = lp.get_one_linearized_kernel( proc_knl_map_dom["loopy_kernel"], proc_knl_map_dom.callables_table) # }}} # {{{ Split iname and see if we get the same result knl_split_iname = ref_knl knl_split_iname = lp.split_iname(knl_split_iname, "t", 32) knl_split_iname = lp.prioritize_loops(knl_split_iname, loop_priority) proc_knl_split_iname = lp.preprocess_kernel(knl_split_iname) lin_knl_split_iname = lp.get_one_linearized_kernel( proc_knl_split_iname["loopy_kernel"], proc_knl_split_iname.callables_table) for d_map_domain, d_split_iname in zip( knl_map_dom["loopy_kernel"].domains, knl_split_iname["loopy_kernel"].domains): d_map_domain_aligned = _ensure_dim_names_match_and_align( d_map_domain, d_split_iname) assert d_map_domain_aligned == d_split_iname for litem_map_domain, litem_split_iname in zip( lin_knl_map_dom.linearization, lin_knl_split_iname.linearization): assert litem_map_domain == litem_split_iname # Can't easily compare instructions because equivalent subscript # expressions may have different orders lp.auto_test_vs_ref(proc_knl_split_iname, ctx_factory(), proc_knl_map_dom, parameters={"nx": 128, "nt": 128, "ni": 128}) # }}} # }}} # {{{ test_map_domain_transform_map_validity_and_errors def test_map_domain_transform_map_validity_and_errors(ctx_factory): # {{{ Make kernel knl = lp.make_kernel( [ "[nx,nt] -> {[x, y, z, t]: 0 <= x,y,z < nx and 0 <= t < nt}", "[m] -> {[j]: 0 <= j < m}", ], """ a[y,x,t,z] = b[y,x,t,z] {id=stmta} for j <>temp = j {dep=stmta} end """, lang_version=(2018, 2), ) knl = lp.add_and_infer_dtypes(knl, {"b": np.float32}) ref_knl = knl # }}} # {{{ Make sure map_domain *succeeds* when map includes 2 of 4 dims in one # domain. # {{{ Apply domain change mapping that splits t and renames y; (similar to # split_iname test above, but doesn't hurt to test this slightly different # scenario) knl_map_dom = ref_knl # Create map_domain mapping that only includes t and y # (x and z should be unaffected) import islpy as isl transform_map = isl.BasicMap( "[nx,nt] -> {[t, y] -> [t_outer, t_inner, y_new]: " "0 <= t_inner < 16 and " "16*t_outer + t_inner = t and " "0 <= 16*t_outer + t_inner < nt and " "y = y_new" "}") # Call map_domain to transform kernel; this should *not* produce an error knl_map_dom = lp.map_domain(knl_map_dom, transform_map) # Prioritize loops desired_prio = "x, t_outer, t_inner, z, y_new" knl_map_dom = lp.prioritize_loops(knl_map_dom, desired_prio) # Get a linearization proc_knl_map_dom = lp.preprocess_kernel(knl_map_dom) lin_knl_map_dom = lp.get_one_linearized_kernel( proc_knl_map_dom["loopy_kernel"], proc_knl_map_dom.callables_table) # }}} # {{{ Use split_iname and rename_iname, and make sure we get the same result knl_split_iname = ref_knl knl_split_iname = lp.split_iname(knl_split_iname, "t", 16) knl_split_iname = lp.rename_iname(knl_split_iname, "y", "y_new") knl_split_iname = lp.prioritize_loops(knl_split_iname, desired_prio) proc_knl_split_iname = lp.preprocess_kernel(knl_split_iname) lin_knl_split_iname = lp.get_one_linearized_kernel( proc_knl_split_iname["loopy_kernel"], proc_knl_split_iname.callables_table) for d_map_domain, d_split_iname in zip( knl_map_dom["loopy_kernel"].domains, knl_split_iname["loopy_kernel"].domains): d_map_domain_aligned = _ensure_dim_names_match_and_align( d_map_domain, d_split_iname) assert d_map_domain_aligned == d_split_iname for litem_map_domain, litem_split_iname in zip( lin_knl_map_dom.linearization, lin_knl_split_iname.linearization): assert litem_map_domain == litem_split_iname # Can't easily compare instructions because equivalent subscript # expressions may have different orders lp.auto_test_vs_ref(proc_knl_split_iname, ctx_factory(), proc_knl_map_dom, parameters={"nx": 32, "nt": 32, "m": 32}) # }}} # }}} # {{{ Make sure we error on a map that is not bijective # Not bijective transform_map = isl.BasicMap( "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new]: " "y = y_new and t = t_new" "}") from loopy.diagnostic import LoopyError knl = ref_knl try: knl = lp.map_domain(knl, transform_map) raise AssertionError() except LoopyError as err: assert "map must be bijective" in str(err) # }}} # {{{ Make sure there's an error if transform map does not apply to # exactly one domain. test_maps = [ # Map where some inames match exactly one domain but there's also a # rogue dim isl.BasicMap( "[nx,nt] -> {[t, y, rogue] -> [t_new, y_new, rogue_new]: " "y = y_new and t = t_new and rogue = rogue_new" "}"), # Map where all inames match exactly one domain but there's also a # rogue dim isl.BasicMap( "[nx,nt] -> {[t, y, x, z, rogue] -> " "[t_new, y_new, x_new, z_new, rogue_new]: " "y = y_new and t = t_new and x = x_new and z = z_new " "and rogue = rogue_new" "}"), # Map where no inames match any domain isl.BasicMap( "[nx,nt] -> {[rogue] -> [rogue_new]: " "rogue = rogue_new" "}"), ] for transform_map in test_maps: try: knl = lp.map_domain(knl, transform_map) raise AssertionError() except LoopyError as err: assert ( "was not applicable to any domain. " "Transform map must be applicable to exactly one domain." in str(err)) # }}} # {{{ Make sure there's an error if we try to map inames in priorities knl = ref_knl knl = lp.prioritize_loops(knl, "y, z") knl = lp.prioritize_loops(knl, "x, z") try: transform_map = isl.BasicMap( "[nx,nt] -> {[t, y] -> [t_new, y_new]: " "y = y_new and t = t_new }") knl = lp.map_domain(knl, transform_map) raise AssertionError() except ValueError as err: assert ( "Loop priority ('y', 'z') contains iname(s) " "transformed by map" in str(err)) # }}} # {{{ Make sure we error when stmt.within_inames contains at least one but # not all mapped inames # {{{ Make potentially problematic kernel knl = lp.make_kernel( [ "[n, m] -> { [i, j]: 0 <= i < n and 0 <= j < m }", "[ell] -> { [k]: 0 <= k < ell }", ], """ for i <>t0 = i {id=stmt0} for j <>t1 = j {id=stmt1, dep=stmt0} end <>t2 = i + 1 {id=stmt2, dep=stmt1} end for k <>t3 = k {id=stmt3, dep=stmt2} end """, lang_version=(2018, 2), ) # }}} # This should fail: try: transform_map = isl.BasicMap( "[n, m] -> {[i, j] -> [i_new, j_new]: " "i_new = i + j and j_new = 2 + i }") knl = lp.map_domain(knl, transform_map) raise AssertionError() except LoopyError as err: assert ( "Statements must be within all or none of the mapped inames" in str(err)) # This should succeed: transform_map = isl.BasicMap( "[n, m] -> {[i] -> [i_new]: i_new = i + 2 }") knl = lp.map_domain(knl, transform_map) # }}} # }}} def test_diamond_tiling(ctx_factory, interactive=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) ref_knl = lp.make_kernel( "[nx,nt] -> {[ix, it]: 1<=ix {[ix, it] -> [tx, tt, tparity, itt, itx]: " "16*(tx - tt) + itx - itt = ix - it and " "16*(tx + tt + tparity) + itt + itx = ix + it and " "0<=tparity<2 and 0 <= itx - itt < 16 and 0 <= itt+itx < 16}") knl = lp.map_domain(knl_for_transform, m) knl = lp.prioritize_loops(knl, "tt,tparity,tx,itt,itx") if interactive: nx = 43 u = np.zeros((nx, 200)) x = np.linspace(-1, 1, nx) dx = x[1] - x[0] u[:, 0] = u[:, 1] = np.exp(-100*x**2) u_dev = cl.array.to_device(queue, u) knl(queue, u=u_dev, dx=dx, dt=dx) u = u_dev.get() import matplotlib.pyplot as plt # pylint: disable=import-error plt.imshow(u.T) plt.show() else: types = {"dt,dx,u": np.float64} knl = lp.add_and_infer_dtypes(knl, types) ref_knl = lp.add_and_infer_dtypes(ref_knl, types) lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={ "nx": 200, "nt": 300, "dx": 1, "dt": 1 }) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): knl = lp.make_kernel( "{[i, j, k]: 0<=i<100 and 0<=j,k<5}", """ y[i, j, k] = x[i, j, k] """, [lp.GlobalArg("x,y", shape=lp.auto, dtype=float)], lang_version=(2018, 2)) knl = lp.extract_subst(knl, "rule1", "x[i, arg1, arg2]", parameters=("arg1", "arg2")) lp.auto_test_vs_ref(knl, ctx_factory(), knl) def test_prefetch_local_into_private(): # https://gitlab.tiker.net/inducer/loopy/-/issues/210 n = 32 m = 32 n_vecs = 32 knl = lp.make_kernel( """{[k,i,j]: 0<=k {[i,j]: 0<=i,j alpha = 2.0 {id=init_alpha} for i for j c[i, j] = alpha*a[i]*b[j] {id=outerproduct} end end """ ], [ lp.GlobalArg("a", dtype, shape=("n",), order=order), lp.GlobalArg("b", dtype, shape=("n",), order=order), lp.GlobalArg("c", dtype, shape=("n, n"), order=order), lp.ValueArg("n", np.int32, approximately=n), ], name="rank_one", assumptions="n >= 16", lang_version=(2018, 2)) ref_knl = knl knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") knl = lp.add_inames_for_unused_hw_axes(knl) assert (knl["rank_one"].id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) assert (knl["rank_one"].id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) assert (knl["rank_one"].id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], parameters={"n": n}) def test_rename_argument_of_domain_params(ctx_factory): knl = lp.make_kernel( "{[i, j]: 0<=i{: n_new=10}")) == assumptions) def test_tag_iname_with_match_pattern(): knl = lp.make_kernel( "{[i0, i1]: 0<=i0, i1 tmp = j out[j] = tmp end """, name="arange_10_to_14", seq_dependencies=True) knl = lp.privatize_temporaries_with_inames(knl, {"j"}) assert knl["arange_10_to_14"].temporary_variables["tmp"].shape == (4,) _, (out, ) = knl(queue) np.testing.assert_allclose(out.get()[10:14], np.arange(10, 14)) def test_unprivatize(): knl = lp.make_kernel( ["{[icoeff]: 0<=icoeff<10}", "{[tgt_box]: 0<=tgt_box<20}", "{[src_box]: 0<=src_box<30}"], """ for tgt_box <> temp[icoeff] = 0 {dup=icoeff} for src_box for icoeff temp[icoeff] = temp[icoeff] + \ deriv[icoeff] * src_coeffs[src_box, icoeff] end end tgt_coeffs[tgt_box, icoeff] = temp[icoeff] {dup=icoeff} end """, name="unprivatize_m2l", seq_dependencies=True) knl = lp.rename_inames(knl, ["icoeff_0", "icoeff", "icoeff_1"], "icoeff0") knl = lp.unprivatize_temporaries_with_inames(knl, {"icoeff0"}, {"temp"}) assert knl["unprivatize_m2l"].temporary_variables["temp"].shape == () def test_unprivatize_error(): knl = lp.make_kernel( ["{[i]: 0<=i<10}", "{[j]: 0<=j<10}", "{[tgt_box]: 0<=tgt_box<20}", "{[src_box]: 0<=src_box<30}"], """ for tgt_box <> temp[i, j] = 0 {dup=i:j} for src_box for i, j temp[j, i] = temp[i, j] + deriv[i, j] * \ src_coeffs[src_box, i, j] end end tgt_coeffs[tgt_box, i, j] = temp[i, j] {dup=i:j} end """, name="unprivatize_m2l", seq_dependencies=True) knl = lp.rename_inames(knl, ["i_0", "i_1", "i"], "i0") knl = lp.rename_inames(knl, ["j_0", "j_1", "j"], "j0") with pytest.raises(lp.LoopyError): knl = lp.unprivatize_temporaries_with_inames(knl, {"i0"}, {"temp"}) with pytest.raises(lp.LoopyError): knl = lp.unprivatize_temporaries_with_inames(knl, {"i0", "j0"}, {"temp"}) def test_privatize_unprivatize_roundtrip(): knl1 = lp.make_kernel( ["{[i]: 0<=i<10}", "{[imatrix]: 0<=imatrix<20}", "{[k]: 0<=k<30}"], """ for imatrix, i <> acc[imatrix] = 0 for k acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k] end end """, name="privatize_unprivatize_roundtrip", seq_dependencies=True) knl2 = lp.make_kernel( ["{[i]: 0<=i<10}", "{[imatrix]: 0<=imatrix<20}", "{[k]: 0<=k<30}"], """ for imatrix, i <> acc = 0 for k acc = acc + a[imatrix, i, k] * vec[k] end end """, name="privatize_unprivatize_roundtrip", seq_dependencies=True) assert knl2 == lp.unprivatize_temporaries_with_inames(knl1, {"imatrix"}, {"acc"}) assert knl1 == lp.privatize_temporaries_with_inames(knl2, {"imatrix"}, {"acc"}) def test_simplify_indices_when_inlining(ctx_factory): ctx = ctx_factory() twice = lp.make_function( "{[i, j]: 0<=i<10 and 0<=j<4}", """ y[i,j] = 2*x[i,j] """, name="zerozerozeroonezeroify") knl = lp.make_kernel( "{:}", """ Y[:,:] = zerozerozeroonezeroify(X[:,:]) """, [lp.GlobalArg("X,Y", shape=(10, 4), dtype=np.float64)]) knl = lp.merge([knl, twice]) inlined_knl = lp.inline_callable_kernel(knl, "zerozerozeroonezeroify") contains_floordiv = ContainsFloorDiv() print(inlined_knl) assert all(not contains_floordiv(insn.expression) for insn in inlined_knl.default_entrypoint.instructions if isinstance(insn, lp.MultiAssignmentBase)) lp.auto_test_vs_ref(knl, ctx, inlined_knl) def test_simplify_indices(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[j]: 0<=j<10}", """ <> b = Z[0] {id=b} Y[j] = X[10*(j//10 + b) + j - 10*b] {dep=b} """, [lp.GlobalArg("X,Y,Z", shape=(10,), dtype=np.int32)]) simplified_knl = lp.simplify_indices(knl) contains_floordiv = ContainsFloorDiv() assert any(contains_floordiv(insn.expression) for insn in knl.default_entrypoint.instructions if isinstance(insn, lp.MultiAssignmentBase)) assert all(not contains_floordiv(insn.expression) for insn in simplified_knl.default_entrypoint.instructions if isinstance(insn, lp.MultiAssignmentBase)) lp.auto_test_vs_ref(knl, ctx, simplified_knl) def test_precompute_does_not_lead_to_dep_cycle(ctx_factory): # See https://github.com/inducer/loopy/issues/498 ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0<=i<10}", """ <> tmp0[i] = 2 * i <> tmp1[i] = 2 * tmp0[i] <> tmp2[i] = 3 * tmp1[i] out[i] = 2*tmp1[i] + 3*tmp2[i] """) ref_knl = knl knl = lp.assignment_to_subst(knl, "tmp1") knl = lp.precompute(knl, "tmp1_subst") lp.auto_test_vs_ref(knl, ctx, ref_knl) def test_rename_inames_redn(): t_unit = lp.make_kernel( "{[i, j0, j1]: 0<=i, j0, j1<10}", """ y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1])) """) t_unit = lp.rename_iname(t_unit, "j1", "ifused") assert "j1" not in t_unit.default_entrypoint.all_inames() assert "ifused" in t_unit.default_entrypoint.all_inames() def test_rename_inames(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[i1, i2]: 0<=i1, i2<10}", """ y1[i1] = 2 y2[i2] = 3 """) ref_knl = knl knl = lp.rename_inames(knl, ["i1", "i2"], "ifused") lp.auto_test_vs_ref(knl, ctx, ref_knl) def test_buffer_array_preserves_rev_deps(ctx_factory): # See https://github.com/inducer/loopy/issues/546 ctx = ctx_factory() knl = lp.make_kernel( ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}", "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"], """ out0[i0] = sum(j0, A[i0] * x[j0]) ... gbarrier {id=gbarrier} out1[i1] = sum(j1, A[i1] * x[j1]) """, seq_dependencies=True) knl = lp.add_dtypes(knl, {"A": np.float64, "x": np.float64}) ref_knl = knl knl = lp.split_iname(knl, "j0", 2) knl = lp.split_iname(knl, "i0", 2, outer_tag="g.0") knl = lp.buffer_array(knl, "out0", buffer_inames=["i0_inner"], init_expression="0") assert "store_out0" in knl.default_entrypoint.id_to_insn["gbarrier"].depends_on lp.auto_test_vs_ref(ref_knl, ctx, knl) def test_rename_inames_existing_ok(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( "{[i1, i2, i3]: 0<=i1, i2, i3<10}", """ y1[i1] = 2 y2[i2] = 3 y3[i3] = 4 """) ref_knl = knl knl = lp.rename_inames(knl, ["i1", "i2"], "i3", existing_ok=True) lp.auto_test_vs_ref(knl, ctx, ref_knl) def test_precompute_with_gbarrier(ctx_factory): # See https://github.com/inducer/loopy/issues/543 ctx = ctx_factory() t_unit = lp.make_kernel( ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}", "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"], """ out0[i0] = sum(j0, A[i0] * x[j0]) ... gbarrier {id=gbarrier} out1[i1] = sum(j1, A[i1] * x[j1]) """, seq_dependencies=True) t_unit = lp.add_dtypes(t_unit, {"A": np.float64, "x": np.float64}) ref_t_unit = t_unit t_unit = lp.add_prefetch(t_unit, "x", sweep_inames=["j1"], within="writes:out1", prefetch_insn_id="x_fetch", default_tag="l.auto") assert "gbarrier" in t_unit.default_entrypoint.id_to_insn["x_fetch"].depends_on lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) def test_buffer_array_with_within(ctx_factory): ctx = ctx_factory() t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ out[i] = 2 * x[i] {id=insn} """) t_unit = lp.add_dtypes(t_unit, {"x": "float64"}) ref_t_unit = t_unit t_unit = lp.buffer_array(t_unit, "out", buffer_inames=[], within="id:insn") lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) def test_redn_iname_unique_preserves_metadata(): class FooTag(Tag): """ foo! """ t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ out = sum(i, 2*i) + sum(i, 3*i) {id=w_out} """) t_unit = lp.tag_inames(t_unit, {"i": FooTag()}) t_unit = lp.make_reduction_inames_unique(t_unit) assert "i_0" in t_unit.default_entrypoint.id_to_insn["w_out"].reduction_inames() assert t_unit.default_entrypoint.inames["i_0"].tags_of_type(FooTag) # fails def test_prefetch_to_same_temp_var(ctx_factory): ctx = ctx_factory() # loopy.git<=5d83454 would raise with a dtype mismatch during the second # prefetch call. t_unit = lp.make_kernel( "{[i0, i1, j0, j1]: 0<=i0, i1<1000 and 0<=j0, j1<10}", """ y0[i0] = sum(j0, A[j0] * x0[i0, j0]) y1[i1] = sum(j1, A[j1] * x1[i1, j1]) """) t_unit = lp.add_dtypes(t_unit, {"A": "float64", "x0": "float64", "x1": "float64"}) ref_tunit = t_unit t_unit = lp.add_prefetch(t_unit, "A", sweep_inames=["j0"], within="iname:i0", temporary_name="A_fetch", prefetch_insn_id="first_fetch" ) t_unit = lp.add_prefetch(t_unit, "A", sweep_inames=["j1"], within="iname:i1", temporary_name="A_fetch", prefetch_insn_id="second_fetch" ) t_unit = lp.add_dependency(t_unit, "writes:y1 or writes:y0", "id:second_fetch or id:first_fetch") t_unit = lp.add_dependency(t_unit, "id:first_fetch", "id:second_fetch") t_unit = lp.add_dependency(t_unit, "id:first_fetch", "id:second_fetch") lp.auto_test_vs_ref(ref_tunit, ctx, t_unit) def test_concatenate_arrays(ctx_factory): ctx = ctx_factory() t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ <> a[i] = x[i] {id=init_a} <> b[i] = y[i] {id=init_b} out[i] = a[i] + b[i] {id=insn,dep=init_a:init_b} """) t_unit = lp.add_dtypes(t_unit, {"x": "float64", "y": "float64"}) ref_t_unit = t_unit t_unit = lp.concatenate_arrays(t_unit, ["a", "b"], "c") assert t_unit.default_entrypoint.temporary_variables["c"].shape == (20,) lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) def test_remove_inames_from_insn(): t_unit = lp.make_kernel( "{[i, j]: 0<=i<10 and 0<=j<20}", """ for i <> a[j] = 1 {id=a} b[i] = a[2*i] {dep=a} end """) t_unit = lp.add_dtypes(t_unit, {"b": "int32"}) t_unit = lp.split_iname(t_unit, "i", 2, inner_tag="l.0") t_unit = lp.split_iname(t_unit, "j", 2, inner_tag="l.0") t_unit = lp.remove_inames_from_insn(t_unit, frozenset(["i_inner"]), "id:a") # Check that the instruction a does not have multiple tagged inames lp.generate_code_v2(t_unit).device_code() def test_remove_predicates_from_insn(): import pymbolic.primitives as prim t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ <> cond = i > 5 {id=cond} a[i] = 1 {if=cond,id=a,dep=cond} """) ref_t_unit = lp.make_kernel( "{[i]: 0<=i<10}", """ <> cond = i > 5 {id=cond} a[i] = 1 {id=a,dep=cond} """) cond = prim.Variable("cond") t_unit = lp.remove_predicates_from_insn(t_unit, frozenset([cond]), "id:a") assert t_unit == ref_t_unit def test_precompute_lets_length1_inames_live_if_requested(): t_unit = lp.make_kernel( "{[e,i]: 0<=e<1 and 0<=i<10}", """ v(e, i) := e + i out[e, i] = v(e, i) """) t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True) from pymbolic import parse assert t_unit.default_entrypoint.id_to_insn["v"].expression == parse("e + i_0") def test_precompute_lets_inner_length1_inames_live(): t_unit = lp.make_kernel( "{[e,i]: 0<=e<1 and 0<=i<10}", """ v(e, i) := e / i #v(eee, i) := eee + i out[e, i] = v(e, i) """) t_unit = lp.split_iname(t_unit, "e", 16) t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True) from pymbolic import parse assert ( t_unit.default_entrypoint.id_to_insn["v"].expression == parse("(e_inner + e_outer*16) / i_0")) def test_duplicate_iname_not_read_only_nested(ctx_factory): # See ctx = ctx_factory() t_unit = lp.make_kernel( "{[i, j]: 0<=i,j<10}", """ for i <> acc = 0 {id=init, tags=foo} for j acc = acc + A[i, j] * x[i, j] {id=update, tags=foo} end y[i] = acc {id=assign, tags=foo} end """, [lp.GlobalArg("A,x,y", shape=lp.auto, dtype=np.float32), ...], seq_dependencies=True, ) ref_t_unit = t_unit t_unit = lp.duplicate_inames( t_unit, inames="i", within="tag:foo", new_inames="irow") print(t_unit) assert (t_unit.default_entrypoint.id_to_insn["init"].within_inames == frozenset({"irow"})) lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) if __name__ == "__main__": import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: from pytest import main main([__file__]) # vim: foldmethod=marker