diff --git a/loopy/statistics.py b/loopy/statistics.py
index 0607a769ef028304fb299ac77d4b829a9a535404..ed21dd0450b6797f9e3c4ded419233ef36b29967 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -460,7 +460,7 @@ def stringify_stats_mapping(m):
 
 class CountGranularity:
     """Strings specifying whether an operation should be counted once per
-    *work-item*, *sub-group*, or *group*.
+    *work-item*, *sub-group*, or *work-group*.
 
     .. attribute :: WORKITEM
 
@@ -472,15 +472,15 @@ class CountGranularity:
         A :class:`str` that specifies that an operation should be counted
         once per *sub-group*.
 
-    .. attribute :: GROUP
+    .. attribute :: WORKGROUP
 
         A :class:`str` that specifies that an operation should be counted
-        once per *group*.
+        once per *work-group*.
 
     """
 
     WORKITEM = "workitem"
     SUBGROUP = "subgroup"
-    GROUP = "group"
+    WORKGROUP = "workgroup"
 
 # {{{ Op descriptor
@@ -501,11 +501,11 @@ class Op(Record):
     .. attribute:: count_granularity
 
         A :class:`str` that specifies whether this operation should be counted
-        once per *work-item*, *sub-group*, or *group*. A work-item is a single
-        instance of computation executing on a single processor (think
+        once per *work-item*, *sub-group*, or *work-group*. A work-item is a
+        single instance of computation executing on a single processor (think
         'thread'), a collection of which may be grouped together into a
         work-group. Each work-group executes on a single compute unit with all
-        work-items within the group sharing local memory. A sub-group is an
+        work-items within the work-group sharing local memory. A sub-group is an
         implementation-dependent grouping of work-items within a work-group,
         analagous to an NVIDIA CUDA warp.
 
@@ -513,7 +513,7 @@ class Op(Record):
 
     count_granularity_options = [CountGranularity.WORKITEM,
                                  CountGranularity.SUBGROUP,
-                                 CountGranularity.GROUP,
+                                 CountGranularity.WORKGROUP,
                                  None]
 
     def __init__(self, dtype=None, name=None, count_granularity=None):
@@ -572,11 +572,11 @@ class MemAccess(Record):
     .. attribute:: count_granularity
 
         A :class:`str` that specifies whether this operation should be counted
-        once per *work-item*, *sub-group*, or *group*. A work-item is a single
-        instance of computation executing on a single processor (think
+        once per *work-item*, *sub-group*, or *work-group*. A work-item is a
+        single instance of computation executing on a single processor (think
         'thread'), a collection of which may be grouped together into a
         work-group. Each work-group executes on a single compute unit with all
-        work-items within the group sharing local memory. A sub-group is an
+        work-items within the work-group sharing local memory. A sub-group is an
         implementation-dependent grouping of work-items within a work-group,
         analagous to an NVIDIA CUDA warp.
 
@@ -584,7 +584,7 @@ class MemAccess(Record):
 
     count_granularity_options = [CountGranularity.WORKITEM,
                                  CountGranularity.SUBGROUP,
-                                 CountGranularity.GROUP,
+                                 CountGranularity.WORKGROUP,
                                  None]
 
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
@@ -1461,31 +1461,31 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                 knl, insn, disregard_local_axes=True,
                 count_redundant_work=count_redundant_work)
 
-        if count_granularity == CountGranularity.GROUP:
+        if count_granularity == CountGranularity.WORKGROUP:
             return ct_disregard_local
         elif count_granularity == CountGranularity.SUBGROUP:
             # get the group size
             from loopy.symbolic import aff_to_expr
             _, local_size = knl.get_grid_size_upper_bounds()
-            group_size = 1
+            workgroup_size = 1
             if local_size:
                 for size in local_size:
                     s = aff_to_expr(size)
                     if not isinstance(s, int):
                         raise LoopyError("Cannot count insn with %s granularity, "
-                                         "group size is not integer: %s"
+                                         "work-group size is not integer: %s"
                                          % (CountGranularity.SUBGROUP, local_size))
-                    group_size *= s
+                    workgroup_size *= s
 
             warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
                 "get_insn_count: when counting instruction %s with "
-                "count_granularity=%s, using upper bound for group size "
-                "(%d work-items) to compute sub-groups per group. When multiple "
-                "device programs present, actual sub-group count may be lower."
-                % (insn_id, CountGranularity.SUBGROUP, group_size))
+                "count_granularity=%s, using upper bound for work-group size "
+                "(%d work-items) to compute sub-groups per work-group. When "
+                "multiple device programs present, actual sub-group count may be "
+                "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size))
 
             from pytools import div_ceil
-            return ct_disregard_local*div_ceil(group_size, subgroup_size)
+            return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
         else:
             # this should not happen since this is enforced in MemAccess
             raise ValueError("get_insn_count: count_granularity '%s' is"
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 7a5d13949ff88f369893c32df51fe834199816e5..bdc64cf837c14162a3fea83c2bd52bda4747000d 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -261,7 +261,7 @@ def test_mem_access_counter_basic():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -282,9 +282,9 @@ def test_mem_access_counter_basic():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group
-    assert f64l == (2*n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group
+    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=0, direction='store', variable='c',
@@ -295,9 +295,9 @@ def test_mem_access_counter_basic():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32s == (n*m*ell)*n_groups*subgroups_per_group
-    assert f64s == (n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group
+    assert f64s == (n*m)*n_workgroups*subgroups_per_group
 
 
 def test_mem_access_counter_reduction():
@@ -320,7 +320,7 @@ def test_mem_access_counter_reduction():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -333,16 +333,16 @@ def test_mem_access_counter_reduction():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=0, direction='store', variable='c',
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32s == (n*ell)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32s == (n*ell)*n_workgroups*subgroups_per_group
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
@@ -376,7 +376,7 @@ def test_mem_access_counter_logic():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -392,10 +392,10 @@ def test_mem_access_counter_logic():
                                 direction='store')
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group
-    assert f64_g_l == (n*m)*n_groups*subgroups_per_group
-    assert f64_g_s == (n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
+    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
+    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
 
 
 def test_mem_access_counter_specialops():
@@ -422,7 +422,7 @@ def test_mem_access_counter_specialops():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -443,9 +443,9 @@ def test_mem_access_counter_specialops():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group
-    assert f64 == (2*n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group
+    assert f64 == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                                stride=0, direction='store', variable='c',
@@ -456,16 +456,16 @@ def test_mem_access_counter_specialops():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32 == (n*m*ell)*n_groups*subgroups_per_group
-    assert f64 == (n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group
+    assert f64 == (n*m)*n_workgroups*subgroups_per_group
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                                      count_granularity=CG.SUBGROUP)
     tot = filtered_map.eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
 
 
 def test_mem_access_counter_bitwise():
@@ -494,7 +494,7 @@ def test_mem_access_counter_bitwise():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -515,8 +515,8 @@ def test_mem_access_counter_bitwise():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                                stride=0, direction='store', variable='c',
@@ -527,8 +527,8 @@ def test_mem_access_counter_bitwise():
                                 count_granularity=CG.SUBGROUP)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group
 
 
 def test_mem_access_counter_mixed():
@@ -557,7 +557,7 @@ def test_mem_access_counter_mixed():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = div_ceil(ell, group_size_0)
+    n_workgroups = div_ceil(ell, group_size_0)
     group_size = group_size_0
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -586,9 +586,9 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f64uniform == (2*n*m)*n_groups*subgroups_per_group
-    assert f32uniform == (m*n)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group
+    assert f32uniform == (m*n)*n_workgroups*subgroups_per_group
 
     expect_fallback = False
     import islpy as isl
@@ -601,9 +601,9 @@ def test_mem_access_counter_mixed():
 
     if expect_fallback:
         if ell < group_size_0:
-            assert f32nonconsec == 3*n*m*ell*n_groups
+            assert f32nonconsec == 3*n*m*ell*n_workgroups
         else:
-            assert f32nonconsec == 3*n*m*n_groups*group_size_0
+            assert f32nonconsec == 3*n*m*n_workgroups*group_size_0
     else:
         assert f32nonconsec == 3*n*m*ell
 
@@ -617,14 +617,14 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                                 ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f64uniform == m*n*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f64uniform == m*n*n_workgroups*subgroups_per_group
 
     if expect_fallback:
         if ell < group_size_0:
-            assert f32nonconsec == n*m*ell*n_groups
+            assert f32nonconsec == n*m*ell*n_workgroups
         else:
-            assert f32nonconsec == n*m*n_groups*group_size_0
+            assert f32nonconsec == n*m*n_workgroups*group_size_0
     else:
         assert f32nonconsec == n*m*ell
 
@@ -779,7 +779,7 @@ def test_count_granularity_val_checks():
     try:
         lp.MemAccess(count_granularity=CG.WORKITEM)
         lp.MemAccess(count_granularity=CG.SUBGROUP)
-        lp.MemAccess(count_granularity=CG.GROUP)
+        lp.MemAccess(count_granularity=CG.WORKGROUP)
        lp.MemAccess(count_granularity=None)
         assert True
         lp.MemAccess(count_granularity='bushel')
@@ -790,7 +790,7 @@ def test_count_granularity_val_checks():
     try:
         lp.Op(count_granularity=CG.WORKITEM)
         lp.Op(count_granularity=CG.SUBGROUP)
-        lp.Op(count_granularity=CG.GROUP)
+        lp.Op(count_granularity=CG.WORKGROUP)
         lp.Op(count_granularity=None)
         assert True
         lp.Op(count_granularity='bushel')
@@ -977,7 +977,7 @@ def test_summations_and_filters():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    n_groups = 1
+    n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
@@ -988,15 +988,15 @@ def test_summations_and_filters():
                                  count_granularity=[CG.SUBGROUP]
                                  ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                       count_granularity=[CG.SUBGROUP]
                                       ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert global_stores == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                  count_granularity=[CG.SUBGROUP]
@@ -1005,9 +1005,9 @@ def test_summations_and_filters():
                                  count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group
-    assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group
+    assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -1016,9 +1016,9 @@ def test_summations_and_filters():
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                           ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert f32lall == (3*n*m*ell)*n_groups*subgroups_per_group
-    assert f64lall == (2*n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group
+    assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group
 
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     #for k, v in op_map.items():
@@ -1052,8 +1052,8 @@ def test_summations_and_filters():
                key.direction == 'load'
 
     s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
-    # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
-    assert s1f64l == (2*n*m)*n_groups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
+    assert s1f64l == (2*n*m)*n_workgroups*subgroups_per_group
 
 
 def test_strided_footprint():
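
Usage sketch (not part of the patch): the convention the renamed test variables encode is that a count reported with CountGranularity.SUBGROUP covers the whole launch, and its expected value is (count per sub-group)*n_workgroups*subgroups_per_group with subgroups_per_group = div_ceil(group_size, subgroup_size). The example below mirrors test_mem_access_counter_basic on a toy untagged kernel; the kernel itself, the subgroup_size of 32, and the n=512 parameter are assumptions made for illustration, and it presumes get_mem_access_map accepts the subgroup_size argument these tests rely on.

# Sketch: predicting a CountGranularity.SUBGROUP count, following the pattern
# of test_mem_access_counter_basic.  Kernel and subgroup_size are assumed.
import numpy as np
from pytools import div_ceil

import loopy as lp
from loopy.statistics import CountGranularity as CG

knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("a", np.float32, shape="n"),
         lp.GlobalArg("out", np.float32, shape="n"),
         lp.ValueArg("n", np.int32)],
        assumptions="n>=1")

subgroup_size = 32  # assumed sub-group ("warp") size
mem_map = lp.get_mem_access_map(knl, subgroup_size=subgroup_size)

params = {"n": 512}

# No parallel tags: the launch is a single work-group of size 1, as in the
# basic/reduction/logic tests above.
n_workgroups = 1
group_size = 1
subgroups_per_group = div_ceil(group_size, subgroup_size)

f32_loads = mem_map.filter_by(mtype=["global"], direction=["load"],
                              count_granularity=[CG.SUBGROUP]
                              ).eval_and_sum(params)

# uniform access: (count-per-sub-group)*n_workgroups*subgroups_per_group
assert f32_loads == params["n"]*n_workgroups*subgroups_per_group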