diff --git a/loopy/statistics.py b/loopy/statistics.py
index c273edd54a364096ad74685d6c9a1549a3d4ef57..5faeb12e3bb8f6f600c01cb2d50f43895b7f2bb9 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -105,6 +105,36 @@ def stringify_stats_mapping(m):
     return result
 
 
+class DataAccess:
+    """Describe the stride of a global memory access.
+
+    Instances appear inside :class:`ToCountMap` keys, so they compare and
+    hash by value of :attr:`stride`.
+
+    .. attribute:: stride
+
+        ``0`` marks a uniform access (the default); other values, possibly
+        symbolic, are stored and compared as given.
+    """
+
+    def __init__(self, stride=0):
+        self.stride = stride
+
+    def __eq__(self, other):
+        return (isinstance(other, DataAccess)
+                and other.stride == self.stride)
+
+    def __ne__(self, other):
+        # spelled out because Python 2 does not derive != from __eq__
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        # consistent with __eq__: equal strides give equal hashes
+        return hash(self.stride)
+
+    def __repr__(self):
+        return "DataAccess(stride=%r)" % (self.stride,)
+
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CombineMapper):
@@ -277,67 +289,63 @@ class GlobalSubscriptCounter(CombineMapper):
         from loopy.symbolic import get_dependencies
         from loopy.kernel.data import LocalIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
-        local_id0 = None
+
+        # find min tag axis
+        import sys
+        min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
-            # find local id0
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                if tag.axis == 0:
-                    local_id0 = iname
-                    break  # there will be only one local_id0
+                if tag.axis < min_tag_axis:
+                    min_tag_axis = tag.axis
 
         if not local_id_found:
             # count as uniform access
             return ToCountMap(
-                    {(self.type_inf(expr), 'uniform'): 1}
+                    {(self.type_inf(expr), DataAccess(stride=0)): 1}
                     ) + self.rec(expr.index)
 
-        if local_id0 is None:
-            # only non-zero local id(s) found, assume non-consecutive access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'nonconsecutive'): 1}
-                    ) + self.rec(expr.index)
+        # get local_id associated with minimum tag axis
+        min_local_id = None
+        for iname in my_inames:
+            tag = self.knl.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                if tag.axis == min_tag_axis:
+                    min_local_id = iname
+                    break  # there will be only one min local_id
+
+        # found local_id associated with minimum tag axis
 
-        # check coefficient of local_id0 for each axis
+        total_stride = None
+        # check coefficient of min_local_id for each axis
         from loopy.symbolic import CoefficientCollector
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
 
             coeffs = CoefficientCollector()(idx)
-            # check if he contains the lid 0 guy
+            # check whether this index expression contains min_local_id
             try:
-                coeff_id0 = coeffs[Variable(local_id0)]
+                coeff_min_lid = coeffs[Variable(min_local_id)]
             except KeyError:
-                # does not contain local_id0
+                # does not contain min_local_id
                 continue
 
-            if coeff_id0 != 1:
-                # non-consecutive access
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # coefficient is 1, now determine if stride is 1
+            # found coefficient of min_local_id
+            # now determine stride
             from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
                 continue
 
-            if stride != 1:
-                # non-consecutive
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # else, stride == 1, continue since another idx could contain id0
+            total_stride = stride*coeff_min_lid
+            # FIXME: total_stride stays None if no axis matches and is overwritten if several axes match
 
-        # loop finished without returning, stride==1 for every instance of local_id0
-        return ToCountMap(
-                {(self.type_inf(expr), 'consecutive'): 1}
-                ) + self.rec(expr.index)
+        return ToCountMap({(self.type_inf(expr),
+                           DataAccess(stride=total_stride)): 1}
+                          ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -717,13 +725,13 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1] == "uniform":
+            if key[1].stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
         for key in subs_assignee.dict:
             poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1] == "uniform":
+            if key[1].stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 2cf537f5ed9c039d09cb1d10066ec9294898d9b9..6e5b6270be571ae7c66e2219bcafdad0d4b63efd 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,9 +28,15 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
-from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
+from loopy.statistics import (
+        get_op_poly,
+        get_gmem_access_poly,
+        get_barrier_poly,
+        DataAccess)
+
 import numpy as np
 
+from pymbolic.primitives import Variable
 
 def test_op_counter_basic():
 
@@ -227,19 +233,19 @@ def test_gmem_access_counter_basic():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                    ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                    ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                    ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                    ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -261,12 +267,12 @@ def test_gmem_access_counter_reduction():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f32 == n*l
 
@@ -289,16 +295,16 @@ def test_gmem_access_counter_logic():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f64 == n*m
 
@@ -323,19 +329,19 @@ def test_gmem_access_counter_specialops():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -364,12 +370,12 @@ def test_gmem_access_counter_bitwise():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'load')
+                    (np.dtype(np.int32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
     i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'store')
+                    (np.dtype(np.int32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
@@ -398,23 +404,23 @@ def test_gmem_access_counter_mixed():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f32uniform = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
                     ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store')
                     ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
@@ -442,19 +448,19 @@ def test_gmem_access_counter_nonconsec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load')
                     ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
     f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'store')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store')
                     ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
@@ -482,19 +488,19 @@ def test_gmem_access_counter_consec():
     params = {'n': n, 'm': m, 'l': l}
 
     f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=1), 'load')
                     ].eval_with_dict(params)
     f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=1), 'load')
                     ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
     f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=1), 'store')
                     ].eval_with_dict(params)
     f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=1), 'store')
                     ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
@@ -588,17 +594,17 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
-                        (np.dtype(np.float32), 'nonconsecutive', 'load')
+                        (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
                         ].eval_with_dict(params)
     f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'load')
+                        (np.dtype(np.float32), DataAccess(stride=1), 'load')
                         ].eval_with_dict(params)
 
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
 
     f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'store')
+                        (np.dtype(np.float32), DataAccess(stride=1), 'store')
                         ].eval_with_dict(params)
 
     assert f32coal == n*l