diff --git a/loopy/statistics.py b/loopy/statistics.py
index 2305144aca15e4c4516a0a67c17a079e20d178ae..2b5e3876ef2e1bf0be39d362359cfe18c732d3a2 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -501,7 +501,7 @@ class Op(Record):
     .. attribute:: count_granularity
 
        A :class:`str` that specifies whether this operation should be counted
-       once per *workitem*, *subgroup*, or *group*.
+       once per *work-item*, *sub-group*, or *group*.
 
     """
 
@@ -566,7 +566,7 @@ class MemAccess(Record):
     .. attribute:: count_granularity
 
        A :class:`str` that specifies whether this operation should be counted
-       once per *workitem*, *subgroup*, or *group*.
+       once per *work-item*, *sub-group*, or *group*.
 
     """
 
@@ -1323,7 +1323,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
     :arg subgroup_size: A :class:`int` that specifies the sub-group size. This
         is used, e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. The default
-        subgroup_size is 32.
+        guess used for ``subgroup_size='guess'`` without a device is 32.
 
     :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
         :class:`islpy.PwQPolynomial` **}**.
@@ -1380,12 +1380,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
     """
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
-    if subgroup_size is None:
-        subgroup_size = 32
-        warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size",
-                         "get_mem_access_map: No subgroup size passed, "
-                         "assuming subgroup size is %d."
-                         % (subgroup_size))
+    if not isinstance(subgroup_size, int):
+        # try to find subgroup_size
+        from loopy.target.pyopencl import PyOpenCLTarget
+        if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None:
+            from pyopencl.characterize import get_simd_group_size
+            subgroup_size_guess = get_simd_group_size(knl.target.device, None)
+            warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size",
+                             "subgroup_size passed: %s. Device: %s. Using "
+                             "sub-group size given by get_simd_group_size(): %d"
+                             % (subgroup_size, knl.target.device,
+                                subgroup_size_guess))
+            subgroup_size = subgroup_size_guess
+        elif subgroup_size == 'guess':
+            # unable to get subgroup_size from device, so guess
+            subgroup_size = 32
+            warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size",
+                             "get_mem_access_map: 'guess' sub-group size passed, "
+                             "no target device found, wildly guessing that "
+                             "sub-group size is %d."
+                             % (subgroup_size))
+
+        if subgroup_size is None:
+            # 'guess' was not passed and either no target device found
+            # or get_simd_group_size returned None
+            raise ValueError("No sub-group size passed and no target device found. "
+                             "Either (1) pass integer value for subgroup_size, "
+                             "(2) ensure that kernel.target is PyOpenClTarget "
+                             "and kernel.target.device is set, or (3) pass "
+                             "subgroup_size='guess' and hope for the best.")
 
     class CacheHolder(object):
         pass
@@ -1432,8 +1455,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
             warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
                     "get_insn_count: when counting instruction %s with "
                     "count_granularity=%s, using upper bound for group size "
-                    "(%d workitems) to compute subgroups per group. When multiple "
-                    "device programs present, actual subgroup count may be lower."
+                    "(%d work-items) to compute sub-groups per group. When multiple "
+                    "device programs present, actual sub-group count may be lower."
                     % (insn_id, CountGranularity.SUBGROUP, group_size))
 
             from pytools import div_ceil
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 25c6dffee401d5bbb58afdace147d793b61467ea..9bfea34ab2cd65b8e20df6fbc164f269fc5d5cc4 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -645,7 +645,8 @@ def test_mem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
+                                    subgroup_size=32)  # noqa
     n = 512
     m = 256
     ell = 128
@@ -735,7 +736,8 @@ def test_mem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
+                                    subgroup_size='guess')
     n = 512
     m = 256
     ell = 128
@@ -889,13 +891,14 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*ell*2
 
-    op_map = lp.get_mem_access_map(knl, count_redundant_work=True)
+    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
+                                           subgroup_size=32)
 
-    f32s1lb = op_map[lp.MemAccess('global', np.float32,
+    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b',
                      count_granularity=cg.WORKITEM)
                      ].eval_with_dict(params)
-    f32s1la = op_map[lp.MemAccess('global', np.float32,
+    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='a',
                      count_granularity=cg.WORKITEM)
                      ].eval_with_dict(params)
@@ -903,7 +906,7 @@ def test_all_counters_parallel_matmul():
     assert f32s1lb == n*m*ell/bsize
     assert f32s1la == n*m*ell/bsize
 
-    f32coal = op_map[lp.MemAccess('global', np.float32,
+    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c',
                      count_granularity=cg.WORKITEM)
                      ].eval_with_dict(params)
@@ -911,7 +914,8 @@ def test_all_counters_parallel_matmul():
     assert f32coal == n*ell
 
     local_mem_map = lp.get_mem_access_map(knl,
-                        count_redundant_work=True).filter_by(mtype=['local'])
+                        count_redundant_work=True,
+                        subgroup_size=32).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load',
                                              count_granularity=cg.WORKITEM)