__copyright__ = "Copyright (C) 2015 James Stevens" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import logging import numpy as np from pymbolic.primitives import Variable from pyopencl.tools import ( # noqa: F401 pytest_generate_tests_for_pyopencl as pytest_generate_tests, ) from pytools import div_ceil import loopy as lp from loopy.statistics import CountGranularity as CG from loopy.types import to_loopy_type from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 logger = logging.getLogger(__name__) SGS = 32 # Subgroup size def test_op_counter_basic(): knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i6 or k/2==ell) else \ (g[i, k] + h[i, k] / 2) """ ], name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64}) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} f32mul = op_map[lp.Op(np.float32, "mul", CG.SUBGROUP, "logic")].eval_with_dict( params) f64add = op_map[lp.Op(np.float64, "add", CG.SUBGROUP, "logic")].eval_with_dict( params) f64div = op_map[lp.Op(np.dtype(np.float64), "div", CG.SUBGROUP, "logic") ].eval_with_dict(params) i32add = op_map[lp.Op(np.dtype(np.int32), "add", CG.SUBGROUP, "logic") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups assert f64div == 2*n*m*n_subgroups # TODO why? assert f64add == n*m*n_subgroups assert i32add == n*m*n_subgroups def test_op_counter_special_ops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i> k)) """ ], name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, { "a": np.int32, "b": np.int32, "g": np.int64, "h": np.int64}) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_within_subscripts=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} print(op_map) i32add = op_map[ lp.Op(np.int32, "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i32bw = op_map[ lp.Op(np.int32, "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64bw = op_map[ lp.Op(np.dtype(np.int64), "bw", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64mul = op_map[ lp.Op(np.dtype(np.int64), "mul", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64add = op_map[ lp.Op(np.dtype(np.int64), "add", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) i64shift = op_map[ lp.Op(np.dtype(np.int64), "shift", CG.SUBGROUP, "bitwise") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups assert i64add == i64mul == n*m*n_subgroups assert i64shift == 2*n*m*n_subgroups def test_op_counter_triangular_domain(): knl = lp.make_kernel( "{[i,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2) """ ], name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group reduced_map = mem_map.group_by("mtype", "dtype", "direction") f32_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float32), direction="load") ].eval_with_dict(params) f64_g_l = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), direction="load") ].eval_with_dict(params) f64_g_s = reduced_map[lp.MemAccess("global", to_loopy_type(np.float64), direction="store") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert f32_g_l == (2*n*m)*n_subgroups assert f64_g_l == (n*m)*n_subgroups assert f64_g_s == (n*m)*n_subgroups def test_mem_access_counter_special_ops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i> k)) """ ], name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, { "a": np.int32, "b": np.int32, "g": np.int32, "h": np.int32}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, direction="load", variable="a", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, direction="load", variable="b", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, direction="load", variable="g", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) i32 += mem_map[lp.MemAccess("global", np.dtype(np.int32), lid_strides={}, gid_strides={}, direction="load", variable="h", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert i32 == (4*n*m+2*n*m*ell)*n_subgroups i32 = mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, direction="store", variable="c", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) i32 += mem_map[lp.MemAccess("global", np.int32, lid_strides={}, gid_strides={}, direction="store", variable="e", count_granularity=CG.SUBGROUP, kernel_name="bitwise") ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups assert i32 == (n*m+n*m*ell)*n_subgroups def test_mem_access_counter_mixed(): knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,j,k]: 0<=ia[i, j, k] = 3.1 <>b[i, j] = 3.2 """, assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, {"a,b": np.float32}) # Change temporary b address space knl = lp.privatize_temporaries_with_inames(knl, "i,j", "b") knl = lp.set_temporary_address_space(knl, "b", "global") mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size="guess") n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} # Count global accesses global_accesses = mem_map.filter_by( mtype=["global"]).sum().eval_with_dict(params) assert global_accesses == n*m def test_count_granularity_val_checks(): try: lp.MemAccess(count_granularity=CG.WORKITEM) lp.MemAccess(count_granularity=CG.SUBGROUP) lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity="bushel") raise AssertionError() except ValueError: assert True try: lp.Op(count_granularity=CG.WORKITEM) lp.Op(count_granularity=CG.SUBGROUP) lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity="bushel") raise AssertionError() except ValueError: assert True def test_barrier_counter_nobarriers(): knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ """ c[i,j,k] = 2*a[i,j,k] {id=first} e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first} """ ], [ lp.TemporaryVariable("c", shape=(50, 10, 99)), ... ], name="weird2", ) knl = lp.add_and_infer_dtypes(knl, {"a": np.int32}) knl = lp.split_iname(knl, "k", 128, inner_tag="l.0") sync_map = lp.get_synchronization_map(knl) print(sync_map) n = 512 m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 def test_barrier_count_single(): knl = lp.make_kernel( "{[i]: 0<=i<128}", """ <> c[i] = 15*i {id=yoink} c[i+1] = c[i] {dep=yoink} """) knl = lp.tag_inames(knl, {"i": "l.0"}) sync_map = lp.get_synchronization_map(knl) print(sync_map) barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() assert barrier_count == 1 def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i loc[i_inner,j_inner] = 3.14f {id=loc_init}", "loc[i_inner,(j_inner+r+4) %% %d] = loc[i_inner,(j_inner+r) %% %d]" " {id=add,dep=loc_init}" % (bsize, bsize), "out0[i_outer*16+i_inner,j_outer*16+j_inner] = loc[i_inner,j_inner]" " {id=store,dep=add}", "end", "end", ], name="local", lang_version=(2018, 2)) knl = lp.add_and_infer_dtypes(knl, {"out0": np.float32}) knl = lp.tag_inames(knl, "i_outer:g.1,i_inner:l.1,j_outer:g.0,j_inner:l.0") n = 512 rept = 64 params = {"n": n, "rept": rept} group_size = bsize*bsize n_workgroups = div_ceil(n, bsize)*div_ceil(n, bsize) subgroups_per_group = div_ceil(group_size, SGS) n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups def test_mem_access_tagged_variables(): bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i]: 0<=i 1: exec(sys.argv[1]) else: from pytest import main main([__file__])