diff --git a/loopy/statistics.py b/loopy/statistics.py
index fd1e2039cccff66ff58ed3acd56c9ad769c5d89b..e3dc2f5030aafe2789fa3feb1ead24b01ba7ec9e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -897,7 +897,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
             return ToCountMap({MemAccess(mtype='global',
                                          dtype=self.type_inf(expr), stride=0,
                                          variable=name,
-                                         count_granularity='warp'): 1}
+                                         count_granularity='subgroup'): 1}
                               ) + self.rec(expr.index)
 
         if min_tag_axis != 0:
@@ -952,7 +952,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
 
             total_stride += stride*coeff_min_lid
 
-        count_granularity = 'thread' if total_stride is not 0 else 'warp'
+        count_granularity = 'thread' if total_stride is not 0 else 'subgroup'
 
         return ToCountMap({MemAccess(
                             mtype='global',
@@ -1284,7 +1284,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False):
 # {{{ get_mem_access_map
 
 def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
-                       wsize=None):
+                       subgroup_size=None):
     """Count the number of memory accesses in a loopy kernel.
 
     :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
@@ -1351,12 +1351,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
     """
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
-    if wsize is None:
-        wsize = 32
-        warn_with_kernel(knl, "get_mem_access_map_assumes_warpsize",
-                         "get_mem_access_map: No warp size passed, "
-                         "assuming warp size is %d."
-                         % (wsize))
+    if subgroup_size is None:
+        subgroup_size = 32
+        warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size",
+                         "get_mem_access_map: No subgroup size passed, "
+                         "assuming subgroup size is %d."
+                         % (subgroup_size))
 
     class CacheHolder(object):
         pass
@@ -1379,8 +1379,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
             return ct
         elif count_granularity == 'thread':
             return ct
-        elif count_granularity == 'warp':
-            return ct/wsize
+        elif count_granularity == 'subgroup':
+            return ct/subgroup_size
         elif count_granularity == 'group':
             from loopy.symbolic import aff_to_expr
             _, local_size = knl.get_grid_size_upper_bounds()
@@ -1397,7 +1397,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         else:
             raise ValueError("get_insn_count: count_granularity '%s' is"
                     "not allowed. count_granularity must be 'group', "
-                    "'warp', or 'thread'." % (count_granularity))
+                    "'subgroup', or 'thread'." % (count_granularity))
 
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
diff --git a/test/test_statistics.py b/test/test_statistics.py
index b3f4d2226667cef8f1573e86f84ef298297f8729..b93e26264ee6ef802d23150fb604a8e6d1e0b5d2 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -247,7 +247,7 @@ def test_mem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
 
     n = 512
     m = 256
@@ -255,33 +255,33 @@ def test_mem_access_counter_basic():
     params = {'n': n, 'm': m, 'ell': ell}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                     ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='h',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                     ].eval_with_dict(params)
-    assert f32l == 3*n*m*ell/32  # /warpsize because these are considered uniform
-    assert f64l == 2*n*m/32  # /warpsize because these are considered uniform
+    assert f32l == 3*n*m*ell/32  # /subgroup_size because these are uniform
+    assert f64l == 2*n*m/32  # /subgroup_size because these are uniform
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
-    assert f32s == n*m*ell/32  # /warpsize because these are considered uniform
-    assert f64s == n*m/32  # /warpsize because these are considered uniform
+    assert f32s == n*m*ell/32  # /subgroup_size because these are uniform
+    assert f64s == n*m/32  # /subgroup_size because these are uniform
 
 
 def test_mem_access_counter_reduction():
@@ -294,26 +294,26 @@ def test_mem_access_counter_reduction():
             name="matmul", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                     ].eval_with_dict(params)
-    assert f32l == 2*n*m*ell/32  # /warpsize because these are considered uniform
+    assert f32l == 2*n*m*ell/32  # /subgroup_size because these are uniform
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
-    assert f32s == n*ell/32  # /warpsize because these are considered uniform
+    assert f32s == n*ell/32  # /subgroup_size because these are uniform
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
@@ -337,7 +337,7 @@ def test_mem_access_counter_logic():
             name="logic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
     n = 512
     m = 256
     ell = 128
@@ -354,9 +354,9 @@ def test_mem_access_counter_logic():
     f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                        direction='store')
                           ].eval_with_dict(params)
-    assert f32_g_l == 2*n*m/32  # /warpsize because these are considered uniform
-    assert f64_g_l == n*m/32  # /warpsize because these are considered uniform
-    assert f64_g_s == n*m/32  # /warpsize because these are considered uniform
+    assert f32_g_l == 2*n*m/32  # /subgroup_size because these are uniform
+    assert f64_g_l == n*m/32  # /subgroup_size because these are uniform
+    assert f64_g_s == n*m/32  # /subgroup_size because these are uniform
 
 
 def test_mem_access_counter_specialops():
@@ -373,46 +373,46 @@ def test_mem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='g',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='h',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
-    assert f32 == 2*n*m*ell/32  # /warpsize because these are considered uniform
-    assert f64 == 2*n*m/32  # /warpsize because these are considered uniform
+    assert f32 == 2*n*m*ell/32  # /subgroup_size because these are uniform
+    assert f64 == 2*n*m/32  # /subgroup_size because these are uniform
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
-    assert f32 == n*m*ell/32  # /warpsize because these are considered uniform
-    assert f64 == n*m/32  # /warpsize because these are considered uniform
+    assert f32 == n*m*ell/32  # /subgroup_size because these are uniform
+    assert f64 == n*m/32  # /subgroup_size because these are uniform
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
-                         count_granularity='warp')
+                         count_granularity='subgroup')
     #tot = lp.eval_and_sum_polys(filtered_map, params)
     tot = filtered_map.eval_and_sum(params)
-    assert tot == (n*m*ell + n*m)/32  # /warpsize for uniform
+    assert tot == (n*m*ell + n*m)/32  # /subgroup_size for uniform
 
 
 def test_mem_access_counter_bitwise():
@@ -432,38 +432,38 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='a',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='b',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='g',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                          stride=0, direction='load', variable='h',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
-    assert i32 == (4*n*m+2*n*m*ell)/32  # /warpsize for uniform
+    assert i32 == (4*n*m+2*n*m*ell)/32  # /subgroup_size for uniform
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='c',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='e',
-                         count_granularity='warp')
+                         count_granularity='subgroup')
                    ].eval_with_dict(params)
-    assert i32 == (n*m+n*m*ell)/32  # /warpsize because these are considered uniform
+    assert i32 == (n*m+n*m*ell)/32  # /subgroup_size because these are uniform
 
 
 def test_mem_access_counter_mixed():
@@ -484,22 +484,22 @@ def test_mem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", bsize)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)  # noqa
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='g',
-                                count_granularity='warp')
+                                count_granularity='subgroup')
                          ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='h',
-                                count_granularity='warp')
+                                count_granularity='subgroup')
                           ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                 stride=0, direction='load', variable='x',
-                                count_granularity='warp')
+                                count_granularity='subgroup')
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=Variable('m'), direction='load',
@@ -511,20 +511,20 @@ def test_mem_access_counter_mixed():
                                 variable='b',
                                 count_granularity='thread')
                             ].eval_with_dict(params)
-    assert f64uniform == 2*n*m*ell/32  # /warpsize for uniform
-    assert f32uniform == n*m*ell/32  # /warpsize for uniform
+    assert f64uniform == 2*n*m*ell/32  # /subgroup_size for uniform
+    assert f32uniform == n*m*ell/32  # /subgroup_size for uniform
     assert f32nonconsec == 3*n*m*ell
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='store', variable='e',
-                                count_granularity='warp')
+                                count_granularity='subgroup')
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                 stride=Variable('m'), direction='store',
                                 variable='c',
                                 count_granularity='thread')
                            ].eval_with_dict(params)
-    assert f64uniform == n*m*ell/32  # /warpsize because these are considered uniform
+    assert f64uniform == n*m*ell/32  # /subgroup_size because these are uniform
     assert f32nonconsec == n*m*ell
 
 
@@ -585,7 +585,8 @@ def test_mem_access_counter_nonconsec():
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*ell
 
-    mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=64)
+    mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True,
+                                      subgroup_size=64)
     f64nonconsec = mem_map64[lp.MemAccess(
                     'global',
                     np.float64, stride=Variable('m'),
@@ -843,26 +844,26 @@ def test_summations_and_filters():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
-                                count_granularity=['warp']
+                                count_granularity=['subgroup']
                                 ).eval_and_sum(params)
-    assert loads_a == 2*n*m*ell/32  # /warpsize because these are considered uniform
+    assert loads_a == 2*n*m*ell/32  # /subgroup_size because these are uniform
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
-                                      count_granularity=['warp']
+                                      count_granularity=['subgroup']
                                       ).eval_and_sum(params)
-    assert global_stores == (n*m*ell + n*m)/32  # /warpsize for uniform
+    assert global_stores == (n*m*ell + n*m)/32  # /subgroup_size for uniform
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
-                                 count_granularity=['warp']
+                                 count_granularity=['subgroup']
                                  ).to_bytes().eval_and_sum(params)
     st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
-                                 count_granularity=['warp']
+                                 count_granularity=['subgroup']
                                  ).to_bytes().eval_and_sum(params)
-    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32  # /warpsize for uniform
-    assert st_bytes == (4*n*m*ell + 8*n*m)/32  # /warpsize for uniform
+    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32  # /subgroup_size for uniform
+    assert st_bytes == (4*n*m*ell + 8*n*m)/32  # /subgroup_size for uniform
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -870,8 +871,8 @@ def test_summations_and_filters():
                           ].eval_with_dict(params)
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                           ].eval_with_dict(params)
-    assert f32lall == 3*n*m*ell/32  # /warpsize because these are considered uniform
-    assert f64lall == 2*n*m/32  # /warpsize because these are considered uniform
+    assert f32lall == 3*n*m*ell/32  # /subgroup_size because these are uniform
+    assert f64lall == 2*n*m/32  # /subgroup_size because these are uniform
 
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     #for k, v in op_map.items():
@@ -904,7 +905,7 @@ def test_summations_and_filters():
         return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
                key.direction == 'load'
     s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
-    assert s1f64l == 2*n*m/32  # /warpsize because these are considered uniform
+    assert s1f64l == 2*n*m/32  # /subgroup_size because these are uniform
 
 
 def test_strided_footprint():