diff --git a/test/test_statistics.py b/test/test_statistics.py
index 0687bff5a5aee0cf042d1a3798a83d2782ad79a0..7a5d13949ff88f369893c32df51fe834199816e5 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -31,7 +31,7 @@ import loopy as lp
 from loopy.types import to_loopy_type
 import numpy as np
 from pytools import div_ceil
-from loopy.statistics import CountGranularity as cg
+from loopy.statistics import CountGranularity as CG
 
 from pymbolic.primitives import Variable
 
@@ -56,12 +56,12 @@ def test_op_counter_basic():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params)
-    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', cg.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*ell
     assert f64mul == n*m
@@ -83,8 +83,8 @@ def test_op_counter_reduction():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', cg.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f32add == f32mul == n*m*ell
 
@@ -113,11 +113,11 @@ def test_op_counter_logic():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.float64, 'add', cg.WORKITEM)].eval_with_dict(params)
-    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', cg.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
@@ -145,17 +145,17 @@ def test_op_counter_specialops():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params)
-    f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params)
-    f64pow = op_map[lp.Op(np.float64, 'pow', cg.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', cg.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                     ].eval_with_dict(params)
-    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', cg.WORKITEM)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM)
                     ].eval_with_dict(params)
-    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', cg.WORKITEM)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f32div == 2*n*m*ell
     assert f32mul == f32add == n*m*ell
@@ -185,15 +185,15 @@ def test_op_counter_bitwise():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    i32add = op_map[lp.Op(np.int32, 'add', cg.WORKITEM)].eval_with_dict(params)
-    i32bw = op_map[lp.Op(np.int32, 'bw', cg.WORKITEM)].eval_with_dict(params)
-    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', cg.WORKITEM)
+    i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw', CG.WORKITEM)].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM)
                    ].eval_with_dict(params)
-    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', cg.WORKITEM)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM)
                     ].eval_with_dict(params)
-    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', cg.WORKITEM)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM)
                     ].eval_with_dict(params)
-    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', cg.WORKITEM)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM)
                       ].eval_with_dict(params)
     assert i32add == n*m+n*m*ell
     assert i32bw == 2*n*m*ell
@@ -226,7 +226,7 @@ def test_op_counter_triangular_domain():
     op_map = lp.get_op_map(
                     knl,
                     count_redundant_work=True
-                    )[lp.Op(np.float64, 'mul', cg.WORKITEM)]
+                    )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -267,19 +267,19 @@ def test_mem_access_counter_basic():
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='h',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -288,11 +288,11 @@ def test_mem_access_counter_basic():
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -326,11 +326,11 @@ def test_mem_access_counter_reduction():
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -338,7 +338,7 @@ def test_mem_access_counter_reduction():
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -428,19 +428,19 @@ def test_mem_access_counter_specialops():
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='b',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='g',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='h',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -449,11 +449,11 @@ def test_mem_access_counter_specialops():
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -461,7 +461,7 @@ def test_mem_access_counter_specialops():
     assert f64 == (n*m)*n_groups*subgroups_per_group
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
     tot = filtered_map.eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -500,19 +500,19 @@ def test_mem_access_counter_bitwise():
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='a',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='b',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='g',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                          stride=0, direction='load', variable='h',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -520,11 +520,11 @@ def test_mem_access_counter_bitwise():
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='c',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='e',
-                         count_granularity=cg.SUBGROUP)
+                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -565,25 +565,25 @@ def test_mem_access_counter_mixed():
                                     subgroup_size=subgroup_size)
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='g',
-                                count_granularity=cg.SUBGROUP)
+                                count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='h',
-                                count_granularity=cg.SUBGROUP)
+                                count_granularity=CG.SUBGROUP)
                           ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                 stride=0, direction='load', variable='x',
-                                count_granularity=cg.SUBGROUP)
+                                count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=Variable('m'), direction='load',
                                 variable='a',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=Variable('m'), direction='load',
                                 variable='b',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -609,12 +609,12 @@ def test_mem_access_counter_mixed():
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='store', variable='e',
-                                count_granularity=cg.SUBGROUP)
+                                count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                 stride=Variable('m'), direction='store',
                                 variable='c',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
@@ -654,22 +654,22 @@ def test_mem_access_counter_nonconsec():
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                 stride=Variable('m'), direction='load',
                                 variable='g',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                                 stride=Variable('m'), direction='load',
                                 variable='h',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=Variable('m')*Variable('ell'),
                                 direction='load', variable='a',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 stride=Variable('m')*Variable('ell'),
                                 direction='load', variable='b',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
@@ -677,12 +677,12 @@ def test_mem_access_counter_nonconsec():
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                 stride=Variable('m'), direction='store',
                                 variable='e',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                 stride=Variable('m')*Variable('ell'),
                                 direction='store', variable='c',
-                                count_granularity=cg.WORKITEM)
+                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*ell
@@ -693,13 +693,13 @@ def test_mem_access_counter_nonconsec():
                     'global',
                     np.float64, stride=Variable('m'),
                     direction='load', variable='g',
-                    count_granularity=cg.WORKITEM)
+                    count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f64nonconsec += mem_map64[lp.MemAccess(
                     'global',
                     np.float64, stride=Variable('m'),
                     direction='load', variable='h',
-                    count_granularity=cg.WORKITEM)
+                    count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32nonconsec = mem_map64[lp.MemAccess(
                     'global',
@@ -707,7 +707,7 @@ def test_mem_access_counter_nonconsec():
                     stride=Variable('m')*Variable('ell'),
                     direction='load',
                     variable='a',
-                    count_granularity=cg.WORKITEM)
+                    count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32nonconsec += mem_map64[lp.MemAccess(
                     'global',
@@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec():
                     stride=Variable('m')*Variable('ell'),
                     direction='load',
                     variable='b',
-                    count_granularity=cg.WORKITEM)
+                    count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
@@ -745,30 +745,30 @@ def test_mem_access_counter_consec():
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='g',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f64consec += mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='h',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                          ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='load', variable='a',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                          ].eval_with_dict(params)
     assert f64consec == 2*n*m*ell
     assert f32consec == 3*n*m*ell
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='store', variable='e',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c',
-                        count_granularity=cg.WORKITEM)
+                        count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     assert f64consec == n*m*ell
     assert f32consec == n*m*ell
@@ -777,9 +777,9 @@ def test_mem_access_counter_consec():
 def test_count_granularity_val_checks():
 
     try:
-        lp.MemAccess(count_granularity=cg.WORKITEM)
-        lp.MemAccess(count_granularity=cg.SUBGROUP)
-        lp.MemAccess(count_granularity=cg.GROUP)
+        lp.MemAccess(count_granularity=CG.WORKITEM)
+        lp.MemAccess(count_granularity=CG.SUBGROUP)
+        lp.MemAccess(count_granularity=CG.GROUP)
         lp.MemAccess(count_granularity=None)
         assert True
         lp.MemAccess(count_granularity='bushel')
@@ -788,9 +788,9 @@ def test_count_granularity_val_checks():
         assert True
 
     try:
-        lp.Op(count_granularity=cg.WORKITEM)
-        lp.Op(count_granularity=cg.SUBGROUP)
-        lp.Op(count_granularity=cg.GROUP)
+        lp.Op(count_granularity=CG.WORKITEM)
+        lp.Op(count_granularity=CG.SUBGROUP)
+        lp.Op(count_granularity=CG.GROUP)
         lp.Op(count_granularity=None)
         assert True
         lp.Op(count_granularity='bushel')
@@ -877,16 +877,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     f32mul = op_map[
-                        lp.Op(np.float32, 'mul', cg.WORKITEM)
+                        lp.Op(np.float32, 'mul', CG.WORKITEM)
                         ].eval_with_dict(params)
     f32add = op_map[
-                        lp.Op(np.float32, 'add', cg.WORKITEM)
+                        lp.Op(np.float32, 'add', CG.WORKITEM)
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        lp.Op(np.int32, 'add', cg.WORKITEM)
+                        lp.Op(np.int32, 'add', CG.WORKITEM)
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        lp.Op(np.dtype(np.int32), 'mul', cg.WORKITEM)
+                        lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM)
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*ell*2
@@ -896,11 +896,11 @@ def test_all_counters_parallel_matmul():
 
     f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                              stride=1, direction='load', variable='b',
-                             count_granularity=cg.WORKITEM)
+                             count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
     f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                              stride=1, direction='load', variable='a',
-                             count_granularity=cg.WORKITEM)
+                             count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
     assert f32s1lb == n*m*ell/bsize
@@ -908,7 +908,7 @@ def test_all_counters_parallel_matmul():
 
     f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                              stride=1, direction='store', variable='c',
-                             count_granularity=cg.WORKITEM)
+                             count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
     assert f32coal == n*ell
@@ -918,7 +918,7 @@ def test_all_counters_parallel_matmul():
                         subgroup_size=32).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load',
-                                             count_granularity=cg.WORKITEM)
+                                             count_granularity=CG.WORKITEM)
                                 ].eval_with_dict(params)
     assert local_mem_l == n*m*ell*2
 
@@ -985,24 +985,24 @@ def test_summations_and_filters():
                                     subgroup_size=subgroup_size)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
-                                count_granularity=[cg.SUBGROUP]
+                                count_granularity=[CG.SUBGROUP]
                                 ).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
     assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
-                                      count_granularity=[cg.SUBGROUP]
+                                      count_granularity=[CG.SUBGROUP]
                                       ).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group
     assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
-                                 count_granularity=[cg.SUBGROUP]
+                                 count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
     st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
-                                 count_granularity=[cg.SUBGROUP]
+                                 count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_groups*subgroups_per_group