diff --git a/loopy/statistics.py b/loopy/statistics.py index 2305144aca15e4c4516a0a67c17a079e20d178ae..2b5e3876ef2e1bf0be39d362359cfe18c732d3a2 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,7 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -566,7 +566,7 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -1323,7 +1323,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, :arg subgroup_size: A :class:`int` that specifies the sub-group size. This is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. The default - subgroup_size is 32. + sub-group size is 32, used only when subgroup_size='guess' is passed and no target device is found. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1380,12 +1380,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - if subgroup_size is None: - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", - "get_mem_access_map: No subgroup size passed, " - "assuming subgroup size is %d." - % (subgroup_size)) + if not isinstance(subgroup_size, int): + # try to find subgroup_size + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: + from pyopencl.characterize import get_simd_group_size + subgroup_size_guess = get_simd_group_size(knl.target.device, None) + warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", + "subgroup_size passed: %s. Device: %s. 
Using " + "sub-group size given by get_simd_group_size(): %d" + % (subgroup_size, knl.target.device, + subgroup_size_guess)) + subgroup_size = subgroup_size_guess + elif subgroup_size == 'guess': + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", + "get_mem_access_map: 'guess' sub-group size passed, " + "no target device found, wildly guessing that " + "sub-group size is %d." + % (subgroup_size)) + + if subgroup_size is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed and no target device found. " + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") class CacheHolder(object): pass @@ -1432,8 +1455,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for group size " - "(%d workitems) to compute subgroups per group. When multiple " - "device programs present, actual subgroup count may be lower." + "(%d work-items) to compute sub-groups per group. When multiple " + "device programs present, actual sub-group count may be lower." 
% (insn_id, CountGranularity.SUBGROUP, group_size)) from pytools import div_ceil diff --git a/test/test_statistics.py b/test/test_statistics.py index 25c6dffee401d5bbb58afdace147d793b61467ea..9bfea34ab2cd65b8e20df6fbc164f269fc5d5cc4 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -645,7 +645,8 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) # noqa n = 512 m = 256 ell = 128 @@ -735,7 +736,8 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size='guess') n = 512 m = 256 ell = 128 @@ -889,13 +891,14 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*ell*2 - op_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) - f32s1lb = op_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', count_granularity=cg.WORKITEM) ].eval_with_dict(params) - f32s1la = op_map[lp.MemAccess('global', np.float32, + f32s1la = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -903,7 +906,7 @@ def test_all_counters_parallel_matmul(): assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = op_map[lp.MemAccess('global', np.float32, + f32coal = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', 
count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -911,7 +914,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*ell local_mem_map = lp.get_mem_access_map(knl, - count_redundant_work=True).filter_by(mtype=['local']) + count_redundant_work=True, + subgroup_size=32).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', count_granularity=cg.WORKITEM)