diff --git a/loopy/statistics.py b/loopy/statistics.py index 2b5e3876ef2e1bf0be39d362359cfe18c732d3a2..d9bbd4b245a8fe60f19197ad6c6ac32e19f2278b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,12 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. + once per *work-item*, *sub-group*, or *group*. A work-item is a single + instance of computation executing on a single processor (think 'thread'), + a collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the group sharing local memory. A sub-group is an implementation-dependent + grouping of work-items within a work-group, analogous to an NVIDIA CUDA warp. """ @@ -1320,10 +1325,15 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. This - is used, e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. The default - sub-group_size is 32. + :arg subgroup_size: An :class:`int` that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within a + work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., + when counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None*, an attempt to find + the sub-group size using the device will be made. A :class:`str` 'guess' + may also be passed as the subgroup_size, in which case get_mem_access_map + will attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. 
:return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.