diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 4efc13de4bc93b1024ad4ccd40d4d1ef5395643b..af8c8281cf7f5b7bbf0f00a0841a15c5f05f14dd 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ...
     <BLANKLINE>
 
 Each line of output will look roughly like::
 
 
-    MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1661,8 +1661,13 @@ Each line of output will look roughly like::
 - dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
   data type accessed.
 
-- stride: An :class:`int` that specifies stride of the memory access. A stride
-  of 0 indicates a uniform access (i.e. all work-items access the same item).
+- stride: A :class:`dict` of **{** :class:`int` **:**
+  :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies
+  local strides for each local id in the memory access index. Local ids not
+  found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
+  work-items within a sub-group access the same item) is indicated by setting
+  ``lid_strides[0]=0``, but may also occur when no local id 0 is found, in
+  which case the 0 key will not be present in lid_strides.
 
 - direction: A :class:`str` that specifies the direction of memory access as
   **load** or **store**.
@@ -1674,13 +1679,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', CG.SUBGROUP)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, 'load', 'g', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', CG.SUBGROUP)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, 'store', 'e', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', CG.SUBGROUP)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, 'load', 'a', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', CG.SUBGROUP)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, 'store', 'c', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1698,7 +1703,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ...
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
@@ -1716,12 +1721,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
 The lines of output above might look like::
 
-    MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
 
 One can see how these functions might be useful in computing, for example,
 achieved memory bandwidth in byte/sec or performance in FLOP/sec.
@@ -1730,7 +1735,7 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec.
 
 Since we have not tagged any of the inames or parallelized the kernel across
 work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map`
-considers the memory accesses *uniform*, so the *stride* of each access is 0.
+finds no local id strides, leaving ``lid_strides`` empty for each memory access.
 Now we'll parallelize the kernel and count the array accesses again. The
 resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
@@ -1740,12 +1745,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), 1, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 1, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 1, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 1, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, a, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, b, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, store, c, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, g, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, h, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, store, e, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access consecutive array
@@ -1755,13 +1760,13 @@ array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'load', 'g', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'store', 'e', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'load', 'a', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'store', 'c', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1772,8 +1777,8 @@ array accesses has not changed:
 
 ~~~~~~~~~~~
 
-To produce *nonconsecutive* array accesses with stride greater than 1, we'll
-switch the inner and outer tags in our parallelization of the kernel:
+To produce *nonconsecutive* array accesses with local id 0 stride greater than 1,
+we'll switch the inner and outer tags in our parallelization of the kernel:
 
 .. doctest::
 
@@ -1781,12 +1786,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), 128, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 128, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 128, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), 128, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, a, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, b, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, store, c, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, g, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, h, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, store, e, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access *nonconsecutive*
@@ -1795,13 +1800,13 @@ changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'load', 'g', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'store', 'e', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'load', 'a', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'store', 'c', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1819,7 +1824,7 @@ criteria are more complicated than a simple list of allowable values:
     >>> def f(key):
     ...     from loopy.types import to_loopy_type
     ...     return key.dtype == to_loopy_type(np.float32) and \
-    ...            key.stride > 1
+    ...            key.lid_strides[0] > 1
     >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
     >>> print(count)
     2097152
diff --git a/loopy/statistics.py b/loopy/statistics.py
index e27a0f482885658888c97081e4fc1d97fcd149fd..5e929b61830ef3dc286728ee63a659a0b16d19c3 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -269,7 +269,7 @@ class ToCountMap(object):
             params = {'n': 512, 'm': 256, 'l': 128}
             mem_map = lp.get_mem_access_map(knl)
             def filter_func(key):
-                return key.stride > 1 and key.stride <= 4:
+                return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4:
 
             filtered_map = mem_map.filter_by_func(filter_func)
             tot = filtered_map.eval_and_sum(params)
@@ -374,16 +374,16 @@ class ToCountMap(object):
             params = {'n': 512, 'm': 256, 'l': 128}
 
             s1_g_ld_byt = bytes_map.filter_by(
-                                mtype=['global'], stride=[1],
+                                mtype=['global'], lid_strides={0: 1},
                                 direction=['load']).eval_and_sum(params)
             s2_g_ld_byt = bytes_map.filter_by(
-                                mtype=['global'], stride=[2],
+                                mtype=['global'], lid_strides={0: 2},
                                 direction=['load']).eval_and_sum(params)
             s1_g_st_byt = bytes_map.filter_by(
-                                mtype=['global'], stride=[1],
+                                mtype=['global'], lid_strides={0: 1},
                                 direction=['store']).eval_and_sum(params)
             s2_g_st_byt = bytes_map.filter_by(
-                                mtype=['global'], stride=[2],
+                                mtype=['global'], lid_strides={0: 2},
                                 direction=['store']).eval_and_sum(params)
 
             # (now use these counts to, e.g., predict performance)
@@ -440,7 +440,7 @@ class ToCountMap(object):
             params = {'n': 512, 'm': 256, 'l': 128}
             mem_map = lp.get_mem_access_map(knl)
             filtered_map = mem_map.filter_by(direction=['load'],
-                                             variable=['a','g'])
+                                             variable=['a', 'g'])
             tot_loads_a_g = filtered_map.eval_and_sum(params)
 
             # (now use these counts to, e.g., predict performance)
@@ -529,7 +529,7 @@ class Op(Record):
                             count_granularity=count_granularity)
 
     def __hash__(self):
-        return hash(str(self))
+        return hash(repr(self))
 
     def __repr__(self):
         # Record.__repr__ overridden for consistent ordering and conciseness
@@ -553,10 +553,16 @@ class MemAccess(Record):
        A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
        data type accessed.
 
-    .. attribute:: stride
+    .. attribute:: lid_strides
 
-       An :class:`int` that specifies stride of the memory access. A stride of
-       0 indicates a uniform access (i.e. all work-items access the same item).
+       A :class:`dict` of **{** :class:`int` **:**
+       :class:`pymbolic.primitives.Expression` or :class:`int` **}** that
+       specifies local strides for each local id in the memory access index.
+       Local ids not found will not be present in ``lid_strides.keys()``.
+       Uniform access (i.e. work-items within a sub-group access the same
+       item) is indicated by setting ``lid_strides[0]=0``, but may also occur
+       when no local id 0 is found, in which case the 0 key will not be
+       present in lid_strides.
 
     .. attribute:: direction
 
@@ -583,12 +589,12 @@ class MemAccess(Record):
 
     """
 
-    def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
+    def __init__(self, mtype=None, dtype=None, lid_strides=None, direction=None,
                  variable=None, count_granularity=None):
 
-        #TODO currently giving all lmem access stride=None
-        if (mtype == 'local') and (stride is not None):
-            raise NotImplementedError("MemAccess: stride must be None when "
+        #TODO currently giving all lmem access lid_strides=None
+        if mtype == 'local' and lid_strides is not None:
+            raise NotImplementedError("MemAccess: lid_strides must be None when "
                                       "mtype is 'local'")
 
         #TODO currently giving all lmem access variable=None
@@ -602,24 +608,26 @@ class MemAccess(Record):
                     % (count_granularity, CountGranularity.ALL+[None]))
 
         if dtype is None:
-            Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride,
+            Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides,
                             direction=direction, variable=variable,
                             count_granularity=count_granularity)
         else:
             from loopy.types import to_loopy_type
             Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype),
-                            stride=stride, direction=direction, variable=variable,
-                            count_granularity=count_granularity)
+                            lid_strides=lid_strides, direction=direction,
+                            variable=variable, count_granularity=count_granularity)
 
     def __hash__(self):
-        return hash(str(self))
+        # Note that this means lid_strides must be sorted in self.__repr__()
+        return hash(repr(self))
 
     def __repr__(self):
         # Record.__repr__ overridden for consistent ordering and conciseness
         return "MemAccess(%s, %s, %s, %s, %s, %s)" % (
             self.mtype,
             self.dtype,
-            self.stride,
+            None if self.lid_strides is None else dict(
+                sorted(six.iteritems(self.lid_strides))),
             self.direction,
             self.variable,
             self.count_granularity)
@@ -870,7 +878,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
             return ToCountMap()
 
         return ToCountMap({MemAccess(mtype='global',
-                                     dtype=self.type_inf(expr), stride=0,
+                                     dtype=self.type_inf(expr), lid_strides={},
                                      variable=name,
                                      count_granularity=CountGranularity.WORKITEM): 1}
                           ) + self.rec(expr.index)
@@ -896,86 +904,81 @@ class GlobalMemAccessCounter(MemAccessCounter):
         from loopy.kernel.data import LocalIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
-        # find min tag axis
-        import sys
-        min_tag_axis = sys.maxsize
-        local_id_found = False
+        # find all local index tags and corresponding inames
+        lid_to_iname = {}
         for iname in my_inames:
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
-                local_id_found = True
-                if tag.axis < min_tag_axis:
-                    min_tag_axis = tag.axis
+                lid_to_iname[tag.axis] = iname
+
+        if not lid_to_iname:
+
+            # no local id found, count as uniform access
+            # Note, a few different cases may be considered uniform:
+            # lid_strides={} if no local ids were found,
+            # lid_strides={1:1, 2:32} if no local id 0 was found,
+            # lid_strides={0:0, ...} if a local id 0 is found and its stride is 0
+            warn_with_kernel(self.knl, "no_lid_found",
+                             "GlobalSubscriptCounter: No local id found, "
+                             "setting lid_strides to {}. Expression: %s"
+                             % (expr))
 
-        if not local_id_found:
-            # count as uniform access
             return ToCountMap({MemAccess(
                                 mtype='global',
-                                dtype=self.type_inf(expr), stride=0,
+                                dtype=self.type_inf(expr), lid_strides={},
                                 variable=name,
                                 count_granularity=CountGranularity.SUBGROUP): 1}
                               ) + self.rec(expr.index)
 
-        if min_tag_axis != 0:
-            warn_with_kernel(self.knl, "unknown_gmem_stride",
-                             "GlobalSubscriptCounter: Memory access minimum "
-                             "tag axis %d != 0, stride unknown, using "
-                             "sys.maxsize." % (min_tag_axis))
-            return ToCountMap({MemAccess(
-                                mtype='global',
-                                dtype=self.type_inf(expr),
-                                stride=sys.maxsize, variable=name,
-                                count_granularity=CountGranularity.WORKITEM): 1}
-                              ) + self.rec(expr.index)
+        # create lid_strides dict (strides are coefficents in flattened index)
+        # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C
+        # come from flattened index [... + C*lid2 + B*lid1 + A*lid0]
 
-        # get local_id associated with minimum tag axis
-        min_lid = None
-        for iname in my_inames:
-            tag = self.knl.iname_to_tag.get(iname)
-            if isinstance(tag, LocalIndexTag):
-                if tag.axis == min_tag_axis:
-                    min_lid = iname
-                    break  # there will be only one min local_id
-
-        # found local_id associated with minimum tag axis
-
-        total_stride = 0
-        # check coefficient of min_lid for each axis
         from loopy.symbolic import CoefficientCollector
         from loopy.kernel.array import FixedStrideArrayDimTag
         from pymbolic.primitives import Variable
-        for idx, axis_tag in zip(index, array.dim_tags):
 
-            from loopy.symbolic import simplify_using_aff
-            from loopy.diagnostic import ExpressionNotAffineError
-            try:
-                coeffs = CoefficientCollector()(
-                          simplify_using_aff(self.knl, idx))
-            except ExpressionNotAffineError:
-                total_stride = None
-                break
-            # check if he contains the lid 0 guy
-            try:
-                coeff_min_lid = coeffs[Variable(min_lid)]
-            except KeyError:
-                # does not contain min_lid
-                continue
-            # found coefficient of min_lid
-            # now determine stride
-            if isinstance(axis_tag, FixedStrideArrayDimTag):
-                stride = axis_tag.stride
-            else:
-                continue
+        lid_strides = {}
+
+        for ltag, iname in six.iteritems(lid_to_iname):
+            ltag_stride = 0
+            # check coefficient of this lid for each axis
+            for idx, axis_tag in zip(index, array.dim_tags):
+
+                from loopy.symbolic import simplify_using_aff
+                from loopy.diagnostic import ExpressionNotAffineError
+                try:
+                    coeffs = CoefficientCollector()(
+                              simplify_using_aff(self.knl, idx))
+                except ExpressionNotAffineError:
+                    ltag_stride = None
+                    break
+
+                # check if idx contains this lid
+                try:
+                    coeff_lid = coeffs[Variable(lid_to_iname[ltag])]
+                except KeyError:
+                    # idx does not contain this lid
+                    continue
+
+                # found coefficient of this lid
+                # now determine stride
+                if isinstance(axis_tag, FixedStrideArrayDimTag):
+                    stride = axis_tag.stride
+                else:
+                    continue
 
-            total_stride += stride*coeff_min_lid
+                ltag_stride += stride*coeff_lid
+            lid_strides[ltag] = ltag_stride
 
-        count_granularity = CountGranularity.WORKITEM if total_stride is not 0 \
-                                else CountGranularity.SUBGROUP
+        count_granularity = CountGranularity.WORKITEM if (
+                                0 in lid_strides and lid_strides[0] != 0
+                                ) else CountGranularity.SUBGROUP
 
         return ToCountMap({MemAccess(
                             mtype='global',
                             dtype=self.type_inf(expr),
-                            stride=total_stride,
+                            lid_strides=dict(sorted(six.iteritems(lid_strides))),
                             variable=name,
                             count_granularity=count_granularity
                             ): 1}
@@ -1386,7 +1389,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         f32_s1_g_ld_a = mem_map[MemAccess(
                                     mtype='global',
                                     dtype=np.float32,
-                                    stride=1,
+                                    lid_strides={0: 1},
                                     direction='load',
                                     variable='a',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1394,7 +1397,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         f32_s1_g_st_a = mem_map[MemAccess(
                                     mtype='global',
                                     dtype=np.float32,
-                                    stride=1,
+                                    lid_strides={0: 1},
                                     direction='store',
                                     variable='a',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1402,7 +1405,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         f32_s1_l_ld_x = mem_map[MemAccess(
                                     mtype='local',
                                     dtype=np.float32,
-                                    stride=1,
+                                    lid_strides={0: 1},
                                     direction='load',
                                     variable='x',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1410,7 +1413,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         f32_s1_l_st_x = mem_map[MemAccess(
                                     mtype='local',
                                     dtype=np.float32,
-                                    stride=1,
+                                    lid_strides={0: 1},
                                     direction='store',
                                     variable='x',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1532,14 +1535,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                     + access_counter_l(insn.assignee)
                     ).with_set_attributes(direction="store")
 
-            # use count excluding local index tags for uniform accesses
             for key, val in six.iteritems(access_expr.count_map):
 
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
                         * get_insn_count(knl, insn.id, key.count_granularity))
-                #currently not counting stride of local mem access
 
             for key, val in six.iteritems(access_assignee.count_map):
 
@@ -1547,7 +1548,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                         access_map
                         + ToCountMap({key: val})
                         * get_insn_count(knl, insn.id, key.count_granularity))
-                # for now, don't count writes to local mem
+
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
         else:
@@ -1560,7 +1561,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                         (MemAccess(
                             mtype=mem_access.mtype,
                             dtype=mem_access.dtype.numpy_dtype,
-                            stride=mem_access.stride,
+                            lid_strides=mem_access.lid_strides,
                             direction=mem_access.direction,
                             variable=mem_access.variable,
                             count_granularity=mem_access.count_granularity),
diff --git a/test/test_statistics.py b/test/test_statistics.py
index ea0bdb62bb75d8a5bcf7dd987c00c33b848091fd..e42c43f60179321114fb695978cc1c91f182e8ee 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -269,19 +269,19 @@ def test_mem_access_counter_basic():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='a',
+                         lid_strides={}, direction='load', variable='a',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='b',
+                         lid_strides={}, direction='load', variable='b',
                          count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
-                         stride=0, direction='load', variable='g',
+                         lid_strides={}, direction='load', variable='g',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
-                         stride=0, direction='load', variable='h',
+                         lid_strides={}, direction='load', variable='h',
                          count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
@@ -290,11 +290,11 @@ def test_mem_access_counter_basic():
     assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                         stride=0, direction='store', variable='c',
+                         lid_strides={}, direction='store', variable='c',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         stride=0, direction='store', variable='e',
+                         lid_strides={}, direction='store', variable='e',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -328,11 +328,11 @@ def test_mem_access_counter_reduction():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='a',
+                         lid_strides={}, direction='load', variable='a',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='b',
+                         lid_strides={}, direction='load', variable='b',
                          count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
@@ -340,7 +340,7 @@ def test_mem_access_counter_reduction():
     assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                         stride=0, direction='store', variable='c',
+                         lid_strides={}, direction='store', variable='c',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -430,19 +430,19 @@ def test_mem_access_counter_specialops():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='a',
+                         lid_strides={}, direction='load', variable='a',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='b',
+                         lid_strides={}, direction='load', variable='b',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         stride=0, direction='load', variable='g',
+                         lid_strides={}, direction='load', variable='g',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         stride=0, direction='load', variable='h',
+                         lid_strides={}, direction='load', variable='h',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -451,11 +451,11 @@ def test_mem_access_counter_specialops():
     assert f64 == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
-                         stride=0, direction='store', variable='c',
+                         lid_strides={}, direction='store', variable='c',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
-                         stride=0, direction='store', variable='e',
+                         lid_strides={}, direction='store', variable='e',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
@@ -502,19 +502,19 @@ def test_mem_access_counter_bitwise():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
-                         stride=0, direction='load', variable='a',
+                         lid_strides={}, direction='load', variable='a',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         stride=0, direction='load', variable='b',
+                         lid_strides={}, direction='load', variable='b',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         stride=0, direction='load', variable='g',
+                         lid_strides={}, direction='load', variable='g',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
-                         stride=0, direction='load', variable='h',
+                         lid_strides={}, direction='load', variable='h',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -522,11 +522,11 @@ def test_mem_access_counter_bitwise():
     assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
-                         stride=0, direction='store', variable='c',
+                         lid_strides={}, direction='store', variable='c',
                          count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         stride=0, direction='store', variable='e',
+                         lid_strides={}, direction='store', variable='e',
                          count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
@@ -567,24 +567,24 @@ def test_mem_access_counter_mixed():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=subgroup_size)
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
-                                stride=0, direction='load', variable='g',
+                                lid_strides={}, direction='load', variable='g',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
-                                stride=0, direction='load', variable='h',
+                                lid_strides={}, direction='load', variable='h',
                                 count_granularity=CG.SUBGROUP)
                           ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
-                                stride=0, direction='load', variable='x',
+                                lid_strides={}, direction='load', variable='x',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                stride=Variable('m'), direction='load',
+                                lid_strides={0: Variable('m')}, direction='load',
                                 variable='a',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                stride=Variable('m'), direction='load',
+                                lid_strides={0: Variable('m')}, direction='load',
                                 variable='b',
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
@@ -611,11 +611,11 @@ def test_mem_access_counter_mixed():
         assert f32nonconsec == 3*n*m*ell
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
-                                stride=0, direction='store', variable='e',
+                                lid_strides={}, direction='store', variable='e',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                stride=Variable('m'), direction='store',
+                                lid_strides={0: Variable('m')}, direction='store',
                                 variable='c',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
@@ -655,22 +655,22 @@ def test_mem_access_counter_nonconsec():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                stride=Variable('m'), direction='load',
+                                lid_strides={0: Variable('m')}, direction='load',
                                 variable='g',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
-                                stride=Variable('m'), direction='load',
+                                lid_strides={0: Variable('m')}, direction='load',
                                 variable='h',
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                stride=Variable('m')*Variable('ell'),
+                                lid_strides={0: Variable('m')*Variable('ell')},
                                 direction='load', variable='a',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                stride=Variable('m')*Variable('ell'),
+                                lid_strides={0: Variable('m')*Variable('ell')},
                                 direction='load', variable='b',
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
@@ -678,12 +678,12 @@ def test_mem_access_counter_nonconsec():
     assert f32nonconsec == 3*n*m*ell
 
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                stride=Variable('m'), direction='store',
+                                lid_strides={0: Variable('m')}, direction='store',
                                 variable='e',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                stride=Variable('m')*Variable('ell'),
+                                lid_strides={0: Variable('m')*Variable('ell')},
                                 direction='store', variable='c',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
@@ -694,20 +694,20 @@ def test_mem_access_counter_nonconsec():
                                       subgroup_size=64)
     f64nonconsec = mem_map64[lp.MemAccess(
                     'global',
-                    np.float64, stride=Variable('m'),
+                    np.float64, lid_strides={0: Variable('m')},
                     direction='load', variable='g',
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f64nonconsec += mem_map64[lp.MemAccess(
                     'global',
-                    np.float64, stride=Variable('m'),
+                    np.float64, lid_strides={0: Variable('m')},
                     direction='load', variable='h',
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f32nonconsec = mem_map64[lp.MemAccess(
                     'global',
                     np.dtype(np.float32),
-                    stride=Variable('m')*Variable('ell'),
+                    lid_strides={0: Variable('m')*Variable('ell')},
                     direction='load',
                     variable='a',
                     count_granularity=CG.WORKITEM)
@@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec():
     f32nonconsec += mem_map64[lp.MemAccess(
                     'global',
                     np.dtype(np.float32),
-                    stride=Variable('m')*Variable('ell'),
+                    lid_strides={0: Variable('m')*Variable('ell')},
                     direction='load',
                     variable='b',
                     count_granularity=CG.WORKITEM)
@@ -747,30 +747,30 @@ def test_mem_access_counter_consec():
     params = {'n': n, 'm': m, 'ell': ell}
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
-                        stride=1, direction='load', variable='g',
+                        lid_strides={0: 1}, direction='load', variable='g',
                         count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f64consec += mem_map[lp.MemAccess('global', np.float64,
-                        stride=1, direction='load', variable='h',
+                        lid_strides={0: 1}, direction='load', variable='h',
                         count_granularity=CG.WORKITEM)
                          ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
-                        stride=1, direction='load', variable='a',
+                        lid_strides={0: 1}, direction='load', variable='a',
                         count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                        stride=1, direction='load', variable='b',
+                        lid_strides={0: 1}, direction='load', variable='b',
                         count_granularity=CG.WORKITEM)
                          ].eval_with_dict(params)
     assert f64consec == 2*n*m*ell
     assert f32consec == 3*n*m*ell
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
-                        stride=1, direction='store', variable='e',
+                        lid_strides={0: 1}, direction='store', variable='e',
                         count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
-                        stride=1, direction='store', variable='c',
+                        lid_strides={0: 1}, direction='store', variable='c',
                         count_granularity=CG.WORKITEM)
                         ].eval_with_dict(params)
     assert f64consec == n*m*ell
@@ -853,7 +853,6 @@ def test_barrier_counter_barriers():
 
 
 def test_all_counters_parallel_matmul():
-
     bsize = 16
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
@@ -898,19 +897,21 @@ def test_all_counters_parallel_matmul():
                                            subgroup_size=32)
 
     f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
-                             stride=1, direction='load', variable='b',
+                             lid_strides={0: 1, 1: Variable('ell')},
+                             direction='load', variable='b',
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
     f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
-                             stride=1, direction='load', variable='a',
-                             count_granularity=CG.WORKITEM)
+                             lid_strides={0: 1, 1: Variable('m')}, direction='load',
+                             variable='a', count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
     assert f32s1lb == n*m*ell/bsize
     assert f32s1la == n*m*ell/bsize
 
     f32coal = mem_access_map[lp.MemAccess('global', np.float32,
-                             stride=1, direction='store', variable='c',
+                             lid_strides={0: 1, 1: Variable('ell')},
+                             direction='store', variable='c',
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
@@ -1057,12 +1058,12 @@ def test_summations_and_filters():
     assert f64ops_all == n*m
 
     def func_filter(key):
-        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
+        return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \
                key.direction == 'load'
-    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
+    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert s1f64l == (2*n*m)*n_workgroups*subgroups_per_group
+    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
 
 
 def test_strided_footprint():