diff --git a/loopy/statistics.py b/loopy/statistics.py
index 3d44826bb47026c11ff28d95b1fa758bda9206f4..31cc94e733dc10ff2e26d9fb8d0ff28d757b4975 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -362,7 +362,7 @@ class ToCountMap(object):
     def to_bytes(self):
         """Convert counts to bytes using data type in map key.
 
-        :return: A :class:`ToCountMap` mapping each original key to a
+        :return: A :class:`ToCountMap` mapping each original key to an
             :class:`islpy.PwQPolynomial` with counts in bytes rather than
             instances.
 
@@ -404,7 +404,7 @@ class ToCountMap(object):
     def sum(self):
         """Add all counts in ToCountMap.
 
-        :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the
+        :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the
             sum of counts.
 
         """
@@ -462,17 +462,17 @@ class CountGranularity:
     """Strings specifying whether an operation should be counted once per
     *work-item*, *sub-group*, or *work-group*.
 
-    .. attribute :: WORKITEM
+    .. attribute:: WORKITEM
 
        A :class:`str` that specifies that an operation should be counted
        once per *work-item*.
 
-    .. attribute :: SUBGROUP
+    .. attribute:: SUBGROUP
 
        A :class:`str` that specifies that an operation should be counted
        once per *sub-group*.
 
-    .. attribute :: WORKGROUP
+    .. attribute:: WORKGROUP
 
        A :class:`str` that specifies that an operation should be counted
        once per *work-group*.
@@ -503,11 +503,13 @@ class Op(Record):
     .. attribute:: count_granularity
 
        A :class:`str` that specifies whether this operation should be counted
-       once per *work-item*, *sub-group*, or *work-group*. A work-item is a
-       single instance of computation executing on a single processor (think
-       'thread'), a collection of which may be grouped together into a
-       work-group. Each work-group executes on a single compute unit with all
-       work-items within the work-group sharing local memory. A sub-group is an
+       once per *work-item*, *sub-group*, or *work-group*. The granularities
+       allowed can be found in :class:`CountGranularity`, and may be accessed,
+       e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance
+       of computation executing on a single processor (think 'thread'), a
+       collection of which may be grouped together into a work-group. Each
+       work-group executes on a single compute unit with all work-items within
+       the work-group sharing local memory. A sub-group is an
        implementation-dependent grouping of work-items within a work-group,
        analagous to an NVIDIA CUDA warp.
 
@@ -569,11 +571,13 @@ class MemAccess(Record):
     .. attribute:: count_granularity
 
        A :class:`str` that specifies whether this operation should be counted
-       once per *work-item*, *sub-group*, or *work-group*. A work-item is a
-       single instance of computation executing on a single processor (think
-       'thread'), a collection of which may be grouped together into a
-       work-group. Each work-group executes on a single compute unit with all
-       work-items within the work-group sharing local memory. A sub-group is an
+       once per *work-item*, *sub-group*, or *work-group*. The granularities
+       allowed can be found in :class:`CountGranularity`, and may be accessed,
+       e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance
+       of computation executing on a single processor (think 'thread'), a
+       collection of which may be grouped together into a work-group. Each
+       work-group executes on a single compute unit with all work-items within
+       the work-group sharing local memory. A sub-group is an
        implementation-dependent grouping of work-items within a work-group,
        analagous to an NVIDIA CUDA warp.
 
@@ -1238,12 +1242,17 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for code
         optimization.)
 
-    :arg subgroup_size: (currently unused) A :class:`int` that specifies the
-        sub-group size. An OpenCL sub-group is an implementation-dependent
-        grouping of work-items within a work-group, analagous to an NVIDIA CUDA
-        warp. subgroup_size is used, e.g., when counting a :class:`MemAccess`
-        whose count_granularity specifies that it should only be counted once
-        per sub-group.
+    :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
+        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        sub-group is an implementation-dependent grouping of work-items within
+        a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used,
+        e.g., when counting a :class:`MemAccess` whose count_granularity
+        specifies that it should only be counted once per sub-group. If set to
+        *None*, an attempt to find the sub-group size using the device will be
+        made; if this fails, an error will be raised. If a :class:`str`
+        ``'guess'`` is passed as the subgroup_size, this function will
+        attempt to find the sub-group size using the device and, if
+        unsuccessful, will make a wild guess.
 
     :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
         :class:`islpy.PwQPolynomial` **}**.
@@ -1345,16 +1354,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for
         code optimization.)
 
-    :arg subgroup_size: A :class:`int` that specifies the sub-group size. An
-        OpenCL sub-group is an implementation-dependent grouping of work-items
-        within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is
-        used, e.g., when counting a :class:`MemAccess` whose count_granularity
-        specifies that it should only be counted once per sub-group. If set to
-        None an attempt to find the sub-group size using the device will be
-        made, if this fails an error will be raised. A :class:`string` 'guess'
-        may also be passed as the subgroup_size, in which case
-        get_mem_access_map will attempt to find the sub-group size using the
-        device and, if unsuccessful, will make a wild guess.
+    :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or
+        *None* that specifies the sub-group size. An OpenCL sub-group is an
+        implementation-dependent grouping of work-items within a work-group,
+        analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
+        counting a :class:`MemAccess` whose count_granularity specifies that it
+        should only be counted once per sub-group. If set to *None*, an attempt
+        to find the sub-group size using the device will be made; if this
+        fails, an error will be raised. If a :class:`str` ``'guess'`` is passed
+        as the subgroup_size, get_mem_access_map will attempt to find the
+        sub-group size using the device and, if unsuccessful, will make a wild
+        guess.
 
     :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
         :class:`islpy.PwQPolynomial` **}**.
@@ -1574,15 +1584,21 @@ def get_synchronization_map(knl, subgroup_size=None):
 
     :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
-    :arg subgroup_size: (currently unused) A :class:`int` that specifies the
-        sub-group size. An OpenCL sub-group is an implementation-dependent
-        grouping of work-items within a work-group, analagous to an NVIDIA CUDA
-        warp. subgroup_size is used, e.g., when counting a :class:`MemAccess`
-        whose count_granularity specifies that it should only be counted once
-        per sub-group.
+    :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
+        ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
+        sub-group is an implementation-dependent grouping of work-items within
+        a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used,
+        e.g., when counting a :class:`MemAccess` whose count_granularity
+        specifies that it should only be counted once per sub-group. If set to
+        *None*, an attempt to find the sub-group size using the device will be
+        made; if this fails, an error will be raised. If a :class:`str`
+        ``'guess'`` is passed as the subgroup_size, this function will
+        attempt to find the sub-group size using the device and, if
+        unsuccessful, will make a wild guess.
 
-    :return: A dictionary mapping each type of synchronization event to a
-        :class:`islpy.PwQPolynomial` holding the number of events per work-item.
+    :return: A dictionary mapping each type of synchronization event to an
+        :class:`islpy.PwQPolynomial` holding the number of events per
+        work-item.
 
         Possible keys include ``barrier_local``, ``barrier_global``
         (if supported by the target) and ``kernel_launch``.
@@ -1794,7 +1810,8 @@ def get_synchronization_poly(knl):
     """Count the number of synchronization events each work-item encounters in
     a loopy kernel.
 
-    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
+    get_synchronization_poly is deprecated. Use get_synchronization_map
+    instead.
 
     """
     warn_with_kernel(knl, "deprecated_get_synchronization_poly",