diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 29fab76e7af1d0a0dfa548a056a36273cf553b38..06756df56fb370ab0b8f05b0273510bbf739437b 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1243,26 +1243,30 @@ Obtaining Performance Statistics
 
 .. {{{
 
-Operations, array access, and barriers can all be counted, which may facilitate
-performance prediction and optimization of a :mod:`loopy` kernel.
+Arithmetic operations, array accesses, and synchronization operations can all
+be counted, which may facilitate performance prediction and optimization of a
+:mod:`loopy` kernel.
 
 .. note::
 
     The functions used in the following examples may produce warnings. If you have
     already made the filterwarnings and catch_warnings calls used in the examples
-    above, you may need to reset these before continuing:
+    above, you may want to reset these before continuing. We will temporarily
+    suppress warnings to keep the output clean:
 
     .. doctest::
 
-        >>> from warnings import resetwarnings
+        >>> from warnings import resetwarnings, filterwarnings
         >>> resetwarnings()
+        >>> filterwarnings('ignore', category=Warning)
 
 Counting operations
 ~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_op_poly` provides information on the number and type of operations
-being performed in a kernel. To demonstrate this, we'll create an example kernel
-that performs several operations on arrays containing different types of data:
+:func:`loopy.get_op_map` provides information on the characteristics and
+quantity of arithmetic operations being performed in a kernel. To demonstrate
+this, we'll create an example kernel that performs several operations on arrays
+containing different types of data:
 
 .. doctest::
 
@@ -1280,37 +1284,41 @@ information provided. Now we will count the operations:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_op_poly
-    >>> op_map = get_op_poly(knl)
+    >>> op_map = lp.get_op_map(knl)
+    >>> print(lp.stringify_stats_mapping(op_map))
+    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
 
-:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** 
-:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The 
-:class:`islpy.PwQPolynomial` holds the number of operations for the type specified 
-in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this 
-map now:
+:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
+:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an
+arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the
+number of operations matching the characteristics of the :class:`loopy.Op`
+specified in the key (in terms of the :class:`loopy.LoopKernel`
+*inames*). :class:`loopy.Op` attributes include:
 
-.. doctest::
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type operated on.
 
-    >>> print(lp.stringify_stats_mapping(op_map))
-    (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    <BLANKLINE>
+- name: A :class:`str` that specifies the kind of arithmetic operation as
+  *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
 
-We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
+One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
-    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
-    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
-    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
-    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
-    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
+    >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
     >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
@@ -1320,174 +1328,238 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
     65536
     65536
 
-Counting array accesses
-~~~~~~~~~~~~~~~~~~~~~~~
+:class:`loopy.ToCountMap` provides member functions that facilitate filtering,
+grouping, and evaluating subsets of the counts. Suppose we want to know the
+total number of 32-bit floating-point operations of any kind. We can easily
+count these using functions :func:`loopy.ToCountMap.filter_by` and
+:func:`loopy.ToCountMap.eval_and_sum`:
+
+.. doctest::
 
-:func:`loopy.get_gmem_access_poly` provides information on the number and type of
-array loads and stores being performed in a kernel. To demonstrate this, we'll
-continue using the kernel from the previous example:
+    >>> filtered_op_map = op_map.filter_by(dtype=[np.float32])
+    >>> f32op_count = filtered_op_map.eval_and_sum(param_dict)
+    >>> print(f32op_count)
+    1572864
+
+We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`,
+which produces a :class:`loopy.ToCountMap` that contains the same counts grouped
+together into keys containing only the specified fields:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_gmem_access_poly
-    >>> load_store_map = get_gmem_access_poly(knl)
-    >>> print(lp.stringify_stats_mapping(load_store_map))
-    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    >>> op_map_dtype = op_map.group_by('dtype')
+    >>> print(lp.stringify_stats_mapping(op_map_dtype))
+    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
+    >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
+    ...                           ].eval_with_dict(param_dict)
+    >>> print(f32op_count)
+    1572864
+
+See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
+more information on these functions.
+
+Counting memory accesses
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`loopy.get_mem_access_map` provides information on the number and
+characteristics of memory accesses performed in a kernel. To demonstrate this,
+we'll continue using the kernel from the previous example:
+
+.. doctest::
+
+    >>> mem_map = lp.get_mem_access_map(knl)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+
+:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+:class:`loopy.MemAccess` attributes include:
 
-:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
-:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
-**:** :class:`islpy.PwQPolynomial` **}**.
+- mtype: A :class:`str` that specifies the memory type accessed as **global**
+  or **local**.
 
-- The :class:`numpy.dtype` specifies the type of the data being accessed.
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type accessed.
 
-- The first string in the map key specifies the DRAM access type as *consecutive*,
-  *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when
-  consecutive threads access consecutive array elements in memory, *nonconsecutive*
-  accesses occur when consecutive threads access nonconsecutive array elements in
-  memory, and *uniform* accesses occur when consecutive threads access the *same*
-  element in memory.
+- stride: An :class:`int` that specifies the stride of the memory access. A stride
+  of 0 indicates a uniform access (i.e. all threads access the same item).
 
-- The second string in the map key specifies the DRAM access type as a *load*, or a
-  *store*.
+- direction: A :class:`str` that specifies the direction of memory access as
+  **load** or **store**.
 
-- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the
-  characteristics specified in the key (in terms of the :class:`loopy.LoopKernel`
-  *inames*).
+- variable: A :class:`str` that specifies the variable name of the data
+  accessed.
 
 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
+
+:class:`loopy.ToCountMap` also makes it easy to determine the total amount
+of data moved in bytes. Suppose we want to know the total amount of global
+memory data loaded and stored. We can produce a map with just this information
+using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
+
+.. doctest::
+
+    >>> bytes_map = mem_map.to_bytes()
+    >>> print(lp.stringify_stats_mapping(bytes_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
+    ...                                         ).group_by('direction')
+    >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
+    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
+    ...                            ].eval_with_dict(param_dict)
+    >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
+    ...                            ].eval_with_dict(param_dict)
+    >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
+    bytes loaded: 7340032
+    bytes stored: 2621440
+
+One can see how these functions might be useful in computing, for example,
+achieved memory bandwidth in byte/sec or performance in FLOP/sec.
 
 ~~~~~~~~~~~
 
-Since we have not tagged any of the inames or parallelized the kernel across threads
-(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
-the array accesses *uniform*. Now we'll parallelize the kernel and count the array
-accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
-this time, so we'll print the mapping manually to make it more legible:
+Since we have not tagged any of the inames or parallelized the kernel across
+threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
+considers the memory accesses *uniform*, so the *stride* of each access is 0.
+Now we'll parallelize the kernel and count the array accesses again. The
+resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
 .. doctest::
 
-    >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
-    >>> load_store_map = get_gmem_access_poly(knl_consec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
+    >>> knl_consec = lp.split_iname(knl, "k", 128,
+    ...                             outer_tag="l.1", inner_tag="l.0")
+    >>> mem_map = lp.get_mem_access_map(knl_consec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
 
-
 With this parallelization, consecutive threads will access consecutive array
 elements in memory. The polynomials are a bit more complicated now due to the
-parallelization, but when we evaluate them, we see that the total number of array
-accesses has not changed:
+parallelization, but when we evaluate them, we see that the total number of
+array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
 
 ~~~~~~~~~~~
 
-To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in
-our parallelization of the kernel:
+To produce *nonconsecutive* array accesses with stride greater than 1, we'll
+switch the inner and outer tags in our parallelization of the kernel:
 
 .. doctest::
 
-    >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
-    >>> load_store_map = get_gmem_access_poly(knl_nonconsec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
+    >>> knl_nonconsec = lp.split_iname(knl, "k", 128,
+    ...                                outer_tag="l.0", inner_tag="l.1")
+    >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
 
+With this parallelization, consecutive threads will access *nonconsecutive*
+array elements in memory. The total number of array accesses still has not
+changed:
 
-With this parallelization, consecutive threads will access *nonconsecutive* array
-elements in memory. The total number of array accesses has not changed:
+.. doctest::
+
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
+    ...                  ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
+    ...                  ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
+
+We can also filter with an arbitrary test function using
+:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter
+criteria are more complicated than a simple list of allowable values:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "load")
-    ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "store")
-    ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> def f(key):
+    ...     from loopy.types import to_loopy_type
+    ...     return key.dtype == to_loopy_type(np.float32) and \
+    ...            key.stride > 1
+    >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
+    >>> print(count)
+    2097152
 
 Counting synchronization events
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_synchronization_poly` counts the number of synchronization
+:func:`loopy.get_synchronization_map` counts the number of synchronization
 events per **thread** in a kernel. First, we'll call this function on the
 kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_synchronization_poly
-    >>> barrier_poly = get_synchronization_poly(knl)
-    >>> print(lp.stringify_stats_mapping(barrier_poly))
+    >>> sync_map = lp.get_synchronization_map(knl)
+    >>> print(lp.stringify_stats_mapping(sync_map))
     kernel_launch : { 1 }
     <BLANKLINE>
 
@@ -1495,7 +1567,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
+    >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
     >>> print("Kernel launch count: %s" % launch_count)
     Kernel launch count: 1
 
@@ -1537,24 +1609,24 @@ Now to make things more interesting, we'll create a kernel with barriers:
         }
     }
 
-
-In this kernel, when a thread performs the second instruction it uses data produced
-by *different* threads during the first instruction. Because of this, barriers are
-required for correct execution, so loopy inserts them. Now we'll count the barriers
-using :func:`loopy.get_barrier_poly`:
+In this kernel, when a thread performs the second instruction it uses data
+produced by *different* threads during the first instruction. Because of this,
+barriers are required for correct execution, so loopy inserts them. Now we'll
+count the barriers using :func:`loopy.get_synchronization_map`:
 
 .. doctest::
 
-    >>> sync_map = lp.get_synchronization_poly(knl)
+    >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(lp.stringify_stats_mapping(sync_map))
     barrier_local : { 1000 }
     kernel_launch : { 1 }
     <BLANKLINE>
 
-Based on the kernel code printed above, we would expect each thread to encounter
-50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
-this case, the number of barriers does not depend on any inames, so we can pass an
-empty dictionary to :func:`islpy.eval_with_dict`.
+Based on the kernel code printed above, we would expect each thread to
+encounter 50x10x2 barriers, which matches the result from
+:func:`loopy.get_synchronization_map`. In this case, the number of barriers
+does not depend on any inames, so we can pass an empty dictionary to
+:func:`islpy.eval_with_dict`.
 
 .. }}}
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 200d871755fdf2c1cbf95db1c7d83c5b6b5441bc..110652cf75d467ceb473d4997142f4dabe3e763b 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -112,10 +112,10 @@ from loopy.transform.parameter import assume, fix_parameters
 from loopy.type_inference import infer_unknown_types
 from loopy.preprocess import preprocess_kernel, realize_reduction
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
-        get_gmem_access_poly,
-        get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
-        sum_mem_access_to_bytes,
+from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op,
+        MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
+        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map,
+        get_synchronization_poly, get_synchronization_map,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
@@ -221,10 +221,10 @@ __all__ = [
         "PreambleInfo",
         "generate_code", "generate_code_v2", "generate_body",
 
-        "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
-        "get_DRAM_access_poly",
-        "get_synchronization_poly", "stringify_stats_mapping",
-        "sum_mem_access_to_bytes",
+        "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess",
+        "get_op_poly", "get_op_map", "get_lmem_access_poly",
+        "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map",
+        "get_synchronization_poly", "get_synchronization_map",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
diff --git a/loopy/statistics.py b/loopy/statistics.py
index a4662f8d7782bb9bbc2de263c8b8d02a649d9430..2ec5eb0d4d5e32dbd9eb201ab718078a6b36f7d8 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import six
 
 import loopy as lp
+import numpy as np
 import warnings
 from islpy import dim_type
 import islpy as isl
@@ -39,13 +40,13 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
-.. autofunction:: get_op_poly
+.. autoclass:: ToCountMap
+.. autoclass:: Op
+.. autoclass:: MemAccess
 
-.. autofunction:: get_gmem_access_poly
-
-.. autofunction:: sum_mem_access_to_bytes
-
-.. autofunction:: get_synchronization_poly
+.. autofunction:: get_op_map
+.. autofunction:: get_mem_access_map
+.. autofunction:: get_synchronization_map
 
 .. autofunction:: gather_access_footprints
 .. autofunction:: gather_access_footprint_bytes
@@ -55,18 +56,27 @@ __doc__ = """
 
 # {{{ ToCountMap
 
-class ToCountMap:
-    """Maps any type of key to an arithmetic type."""
+class ToCountMap(object):
+    """Maps any type of key to an arithmetic type.
+
+    .. automethod:: filter_by
+    .. automethod:: filter_by_func
+    .. automethod:: group_by
+    .. automethod:: to_bytes
+    .. automethod:: sum
+    .. automethod:: eval_and_sum
+
+    """
 
     def __init__(self, init_dict=None):
         if init_dict is None:
             init_dict = {}
-        self.dict = init_dict
+        self.count_map = init_dict
 
     def __add__(self, other):
-        result = self.dict.copy()
-        for k, v in six.iteritems(other.dict):
-            result[k] = self.dict.get(k, 0) + v
+        result = self.count_map.copy()
+        for k, v in six.iteritems(other.count_map):
+            result[k] = self.count_map.get(k, 0) + v
         return ToCountMap(result)
 
     def __radd__(self, other):
@@ -80,8 +90,8 @@ class ToCountMap:
     def __mul__(self, other):
         if isinstance(other, isl.PwQPolynomial):
             return ToCountMap(dict(
-                (index, self.dict[index]*other)
-                for index in self.dict.keys()))
+                (index, self.count_map[index]*other)
+                for index in self.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
                                 "ToCountMap by {0} {1}."
@@ -91,12 +101,262 @@ class ToCountMap:
 
     def __getitem__(self, index):
         try:
-            return self.dict[index]
+            return self.count_map[index]
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
+    def __setitem__(self, index, value):
+        self.count_map[index] = value
+
     def __repr__(self):
-        return repr(self.dict)
+        return repr(self.count_map)
+
+    def __len__(self):
+        return len(self.count_map)
+
+    def items(self):
+        return self.count_map.items()
+
+    def keys(self):
+        return self.count_map.keys()
+
+    def pop(self, item):
+        return self.count_map.pop(item)
+
+    def copy(self):
+        return ToCountMap(dict(self.count_map))
+
+    def filter_by(self, **kwargs):
+        """Keep only items whose key fields match the specified values.
+
+        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
+                             the :class:`ToCountMap`, each given a list of
+                             allowable values for that key field.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` that match the field values
+                 passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        from loopy.types import to_loopy_type
+        if 'dtype' in kwargs.keys():
+            kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
+
+        # for each item in self.count_map
+        for self_key, self_val in self.items():
+            try:
+                # check to see if key attribute values match all filters
+                for arg_field, allowable_vals in kwargs.items():
+                    attr_val = getattr(self_key, arg_field)
+                    # see if the value is in the filter list
+                    if attr_val not in allowable_vals:
+                        break
+                else:  # loop terminated without break or error
+                    result_map[self_key] = self_val
+            except(AttributeError):
+                # the field passed is not a field of this key
+                continue
+
+        return result_map
+
+    def filter_by_func(self, func):
+        """Keep items that pass a test.
+
+        :parameter func: A function that takes a map key as a parameter and
+                         returns a :class:`bool`.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` for which func(key) is true.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            def filter_func(key):
+                return key.stride > 1 and key.stride <= 4
+
+            filtered_map = mem_map.filter_by_func(filter_func)
+            tot = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # for each item in self.count_map, call func on the key
+        for self_key, self_val in self.items():
+            if func(self_key):
+                result_map[self_key] = self_val
+
+        return result_map
+
+    def group_by(self, *args):
+        """Group map items together, distinguishing by only the key fields
+           passed in args.
+
+        :parameter \*args: Zero or more :class:`str` fields of map keys.
+
+        :return: A :class:`ToCountMap` containing the same total counts
+                 grouped together by new keys that only contain the fields
+                 specified in the arguments passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = get_mem_access_map(knl)
+            grouped_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+            f32_global_ld = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='load')
+                                       ].eval_with_dict(params)
+            f32_global_st = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='store')
+                                       ].eval_with_dict(params)
+            f32_local_ld = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='load')
+                                      ].eval_with_dict(params)
+            f32_local_st = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='store')
+                                      ].eval_with_dict(params)
+
+            op_map = get_op_map(knl)
+            ops_dtype = op_map.group_by('dtype')
+
+            f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params)
+            f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params)
+            i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # make sure all item keys have same type
+        if self.count_map:
+            key_type = type(list(self.keys())[0])
+            if not all(isinstance(x, key_type) for x in self.keys()):
+                raise ValueError("ToCountMap: group_by() function may only "
+                                 "be used on ToCountMaps with uniform keys")
+        else:
+            return result_map
+
+        # for each item in self.count_map
+        for self_key, self_val in self.items():
+            new_key = key_type()
+
+            # set all specified fields
+            for field in args:
+                setattr(new_key, field, getattr(self_key, field))
+
+            if new_key in result_map.keys():
+                result_map[new_key] += self_val
+            else:
+                result_map[new_key] = self_val
+
+        return result_map
+
+    def to_bytes(self):
+        """Convert counts to bytes using data type in map key.
+
+        :return: A :class:`ToCountMap` mapping each original key to a
+                 :class:`islpy.PwQPolynomial` with counts in bytes rather than
+                 instances.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            bytes_map = get_mem_access_map(knl).to_bytes()
+            params = {'n': 512, 'm': 256, 'l': 128}
+
+            s1_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['load']).eval_and_sum(params)
+            s2_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['load']).eval_and_sum(params)
+            s1_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['store']).eval_and_sum(params)
+            s2_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['store']).eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result = self.copy()
+
+        for key, val in self.items():
+            bytes_processed = int(key.dtype.itemsize) * val
+            result[key] = bytes_processed
+
+        return result
+
+
+    def sum(self):
+        """Add all counts in ToCountMap.
+
+        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
+
+        """
+        total = isl.PwQPolynomial('{ 0 }')
+        for k, v in self.items():
+            if not isinstance(v, isl.PwQPolynomial):
+                raise ValueError("ToCountMap: sum() encountered type {0} but "
+                                 "may only be used on PwQPolynomials."
+                                 .format(type(v)))
+            total += v
+        return total
+
+
+    def eval_and_sum(self, params):
+        """Add all counts in :class:`ToCountMap` and evaluate with provided
+        parameter dict.
+
+        :return: An :class:`int` containing the sum of all counts in the
+                 :class:`ToCountMap` evaluated with the parameters provided.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+        return self.sum().eval_with_dict(params)
 
 # }}}
 
@@ -108,6 +368,143 @@ def stringify_stats_mapping(m):
     return result
 
 
+class Op(object):
+    """An arithmetic operation.
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type operated on.
+
+    .. attribute:: name
+
+       A :class:`str` that specifies the kind of arithmetic operation as
+       *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+
+    """
+
+    def __init__(self, dtype=None, name=None):
+        self.name = name
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
+
+    def __eq__(self, other):
+        return isinstance(other, Op) and (
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.name is None or other.name is None or
+                 self.name == other.name))
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __str__(self):
+        if self.dtype is None:
+            dtype = 'None'
+        else:
+            dtype = str(self.dtype)
+        if self.name is None:
+            name = 'None'
+        else:
+            name = self.name
+        return "Op("+dtype+", "+name+")"
+
+
+class MemAccess(object):
+    """A memory access.
+
+    .. attribute:: mtype
+
+       A :class:`str` that specifies the memory type accessed as **global**
+       or **local**.
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type accessed.
+
+    .. attribute:: stride
+
+       An :class:`int` that specifies the stride of the memory access. A stride of 0
+       indicates a uniform access (i.e. all threads access the same item).
+
+    .. attribute:: direction
+
+       A :class:`str` that specifies the direction of memory access as
+       **load** or **store**.
+
+    .. attribute:: variable
+
+       A :class:`str` that specifies the variable name of the data
+       accessed.
+
+    """
+
+    def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None):
+        self.mtype = mtype
+        self.stride = stride
+        self.direction = direction
+        self.variable = variable
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
+
+        #TODO currently giving all lmem access stride=None
+        if (mtype == 'local') and (stride is not None):
+            raise NotImplementedError("MemAccess: stride must be None when "
+                                      "mtype is 'local'")
+
+        #TODO currently giving all lmem access variable=None
+        if (mtype == 'local') and (variable is not None):
+            raise NotImplementedError("MemAccess: variable must be None when "
+                                      "mtype is 'local'")
+
+    def __eq__(self, other):
+        return isinstance(other, MemAccess) and (
+                (self.mtype is None or other.mtype is None or
+                 self.mtype == other.mtype) and
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.stride is None or other.stride is None or
+                 self.stride == other.stride) and
+                (self.direction is None or other.direction is None or
+                 self.direction == other.direction) and
+                (self.variable is None or other.variable is None or
+                 self.variable == other.variable))
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __str__(self):
+        if self.mtype is None:
+            mtype = 'None'
+        else:
+            mtype = self.mtype
+        if self.dtype is None:
+            dtype = 'None'
+        else:
+            dtype = str(self.dtype)
+        if self.stride is None:
+            stride = 'None'
+        else:
+            stride = str(self.stride)
+        if self.direction is None:
+            direction = 'None'
+        else:
+            direction = self.direction
+        if self.variable is None:
+            variable = 'None'
+        else:
+            variable = self.variable
+        return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \
+               +variable+")"
+
+
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CombineMapper):
@@ -126,41 +523,33 @@ class ExpressionOpCounter(CombineMapper):
     map_tagged_variable = map_constant
     map_variable = map_constant
 
-    #def map_wildcard(self, expr):
-    #    return 0,0
-
-    #def map_function_symbol(self, expr):
-    #    return 0,0
-
     def map_call(self, expr):
         return ToCountMap(
-                    {(self.type_inf(expr), 'func:'+str(expr.function)): 1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
-    # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
-
-    def map_subscript(self, expr):  # implemented in CombineMapper
+    def map_subscript(self, expr):
         return self.rec(expr.index)
 
-    # def map_lookup(self, expr):  # implemented in CombineMapper
-
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
-                    {(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='add'): len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
         assert expr.children
-        return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
+        return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
-                   ToCountMap({(self.type_inf(expr), 'mul'): -1})
+                   ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1})
 
     def map_quotient(self, expr, *args):
-        return ToCountMap({(self.type_inf(expr), 'div'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -168,54 +557,47 @@ class ExpressionOpCounter(CombineMapper):
     map_remainder = map_quotient
 
     def map_power(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
     def map_bitwise_not(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
-        return ToCountMap(
-                        {(self.type_inf(expr), 'bw'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'):
+                           len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
     map_bitwise_and = map_bitwise_or
 
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_logical_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_logical_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_if_branches_ops", 
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_ifpos_branches_ops",
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_min(self, expr):
-        return ToCountMap(
-                        {(self.type_inf(expr), 'maxmin'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'):
+                           len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
 
@@ -225,11 +607,13 @@ class ExpressionOpCounter(CombineMapper):
                                   "map_common_subexpression not implemented.")
 
     def map_substitution(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered substitution, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "substitution, "
                                   "map_substitution not implemented.")
 
     def map_derivative(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered derivative, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "derivative, "
                                   "map_derivative not implemented.")
 
     def map_slice(self, expr):
@@ -239,6 +623,83 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
+# {{{ LocalSubscriptCounter
+
+class LocalSubscriptCounter(CombineMapper):
+
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.type_inference import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    map_tagged_variable = map_constant
+    map_variable = map_constant
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_subscript(self, expr):
+        sub_map = ToCountMap()
+        name = expr.aggregate.name  # name of array
+        if name in self.knl.temporary_variables:
+            array = self.knl.temporary_variables[name]
+            if array.is_local:
+                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
+        return sub_map + self.rec(expr.index)
+            
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    def map_if(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_common_subexpression(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "common_subexpression, "
+                                  "map_common_subexpression not implemented.")
+
+    def map_substitution(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "substitution, "
+                                  "map_substitution not implemented.")
+
+    def map_derivative(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "derivative, "
+                                  "map_derivative not implemented.")
+
+    def map_slice(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
+                                  "map_slice not implemented.")
+
+# }}}
+
+
 # {{{ GlobalSubscriptCounter
 
 class GlobalSubscriptCounter(CombineMapper):
@@ -278,33 +739,52 @@ class GlobalSubscriptCounter(CombineMapper):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
-        local_id0 = None
+
+        # find min tag axis
+        import sys
+        min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
-            # find local id0
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                if tag.axis == 0:
-                    local_id0 = iname
-                    break  # there will be only one local_id0
+                if tag.axis < min_tag_axis:
+                    min_tag_axis = tag.axis
 
         if not local_id_found:
             # count as uniform access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'uniform'): 1}
-                    ) + self.rec(expr.index)
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr), stride=0,
+                                         variable=name): 1}
+                             ) + self.rec(expr.index)
+
+        if min_tag_axis != 0:
+            warn_with_kernel(self.knl, "unknown_gmem_stride",
+                             "GlobalSubscriptCounter: Memory access minimum "
+                             "tag axis %d != 0, stride unknown, using "
+                             "sys.maxsize." % (min_tag_axis))
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr),
+                                         stride=sys.maxsize, variable=name): 1}
+                             ) + self.rec(expr.index)
+
+        # get local_id associated with minimum tag axis
+        min_lid = None
+        for iname in my_inames:
+            tag = self.knl.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                if tag.axis == min_tag_axis:
+                    min_lid = iname
+                    break  # there will be only one min local_id
 
-        if local_id0 is None:
-            # only non-zero local id(s) found, assume non-consecutive access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'nonconsecutive'): 1}
-                    ) + self.rec(expr.index)
+        # found local_id associated with minimum tag axis
 
-        # check coefficient of local_id0 for each axis
+        total_stride = 0
+        # check coefficient of min_lid for each axis
         from loopy.symbolic import CoefficientCollector
+        from loopy.kernel.array import FixedStrideArrayDimTag
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
 
@@ -312,36 +792,22 @@ class GlobalSubscriptCounter(CombineMapper):
             coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx))
             # check if he contains the lid 0 guy
             try:
-                coeff_id0 = coeffs[Variable(local_id0)]
+                coeff_min_lid = coeffs[Variable(min_lid)]
             except KeyError:
-                # does not contain local_id0
+                # does not contain min_lid
                 continue
-
-            if coeff_id0 != 1:
-                # non-consecutive access
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # coefficient is 1, now determine if stride is 1
-            from loopy.kernel.array import FixedStrideArrayDimTag
+            # found coefficient of min_lid
+            # now determine stride
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
                 continue
 
-            if stride != 1:
-                # non-consecutive
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
+            total_stride += stride*coeff_min_lid
 
-            # else, stride == 1, continue since another idx could contain id0
-
-        # loop finished without returning, stride==1 for every instance of local_id0
-        return ToCountMap(
-                {(self.type_inf(expr), 'consecutive'): 1}
-                ) + self.rec(expr.index)
+        return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr),
+                                     stride=total_stride, variable=name): 1}
+                         ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -351,48 +817,19 @@ class GlobalSubscriptCounter(CombineMapper):
 
     map_product = map_sum
 
-    def map_quotient(self, expr, *args):
-        return self.rec(expr.numerator) + self.rec(expr.denominator)
-
-    map_floor_div = map_quotient
-    map_remainder = map_quotient
-
-    def map_power(self, expr):
-        return self.rec(expr.base) + self.rec(expr.exponent)
-
-    def map_left_shift(self, expr):
-        return self.rec(expr.shiftee)+self.rec(expr.shift)
-
-    map_right_shift = map_left_shift
-
-    def map_bitwise_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_bitwise_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_bitwise_xor = map_bitwise_or
-    map_bitwise_and = map_bitwise_or
-
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    map_logical_not = map_bitwise_not
-    map_logical_or = map_bitwise_or
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        warn_with_kernel(self.knl, "summing_if_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
-
-    map_min = map_bitwise_or
-    map_max = map_min
+        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("GlobalSubscriptCounter encountered "
@@ -524,7 +961,8 @@ def count(kernel, set):
 
             # {{{ rebuild check domain
 
-            zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space))
+            zero = isl.Aff.zero_on_domain(
+                        isl.LocalSpace.from_space(bset.space))
             iname = isl.PwAff.from_aff(
                     zero.set_coefficient_val(isl.dim_type.in_, i, 1))
             dmin_matched = dmin.insert_dims(
@@ -584,31 +1022,44 @@ def get_op_poly(knl, numpy_types=True):
 
     """Count the number of operations in a loopy kernel.
 
+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "depricated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
+
+def get_op_map(knl, numpy_types=True):
+
+    """Count the number of operations in a loopy kernel.
+
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **)**
-             **:** :class:`islpy.PwQPolynomial` **}**.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
+    :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
+             :class:`islpy.PwQPolynomial` **}**.
 
-             - The string specifies the operation type as
-               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+             - The :class:`Op` specifies the characteristics of the arithmetic
+               operation.
 
              - The :class:`islpy.PwQPolynomial` holds the number of operations of
                the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *parameter inames*).
+               :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        poly = get_op_poly(knl)
+        op_map = get_op_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params)
+        f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -618,88 +1069,128 @@ def get_op_poly(knl, numpy_types=True):
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    op_poly = ToCountMap()
+    op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
         # check domain size:
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(
+                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_poly = op_poly + ops*count(knl, domain)
-    result = op_poly.dict
+        op_map = op_map + ops*count(knl, domain)
 
     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind), count)
-                for (dtype, kind), count in six.iteritems(result))
+        op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+                                 count)
+                for op, count in six.iteritems(op_map.count_map))
 
-    return result
-# }}}
+    return op_map
 
 
-def sum_ops_to_dtypes(op_poly_dict):
-    result = {}
-    for (dtype, kind), v in op_poly_dict.items():
-        new_key = dtype
-        if new_key in result:
-            result[new_key] += v
-        else:
-            result[new_key] = v
+#TODO test deprecated functions?
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.
 
-    return result
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 
 # {{{ get_gmem_access_poly
-def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
 
+def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
-                    counted.
+    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **,**
-             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+    """
+    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+# }}}
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
 
-             - The first string in the map key specifies the global memory
-               access type as
-               *consecutive*, *nonconsecutive*, or *uniform*.
+def get_mem_access_map(knl, numpy_types=True):
+    """Count the number of memory accesses in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+                    counted.
 
-             - The second string in the map key specifies the global memory
-               access type as a
-               *load*, or a *store*.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
-               with the characteristics specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
+    :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
+             :class:`islpy.PwQPolynomial` **}**.
+
+             - The :class:`MemAccess` specifies the characteristics of the
+               memory access.
+
+             - The :class:`islpy.PwQPolynomial` holds the number of memory
+               accesses with the characteristics specified in the key (in terms
+               of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        subscript_map = get_gmem_access_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-
-        f32_uncoalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'nonconsecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_store = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'store')
-                            ].eval_with_dict(params)
+        mem_map = get_mem_access_map(knl)
+
+        f32_s1_g_ld_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_g_st_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_l_ld_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='x')
+                               ].eval_with_dict(params)
+        f32_s1_l_st_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='x')
+                               ].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
     """
-
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
     class CacheHolder(object):
@@ -712,7 +1203,8 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
         if uniform:
             from loopy.kernel.data import LocalIndexTag
             insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+                           isinstance(
+                           knl.iname_to_tag.get(iname), LocalIndexTag)]
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(
                                 insn_inames, [dim_type.set]))
@@ -721,82 +1213,82 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_poly = ToCountMap()
-    subscript_counter = GlobalSubscriptCounter(knl)
+    subs_map = ToCountMap()
+    subs_counter_g = GlobalSubscriptCounter(knl)
+    subs_counter_l = LocalSubscriptCounter(knl)
+
     for insn in knl.instructions:
-        # count subscripts, distinguishing loads and stores
-        subs_expr = subscript_counter(insn.expression)
-        subs_expr = ToCountMap(dict(
-            (key + ("load",), val)
-            for key, val in six.iteritems(subs_expr.dict)))
-        subs_assignee = subscript_counter(insn.assignee)
-        subs_assignee = ToCountMap(dict(
-            (key + ("store",), val)
-            for key, val in six.iteritems(subs_assignee.dict)))
+        # count subscripts
+        subs_expr = subs_counter_g(insn.expression) \
+                    + subs_counter_l(insn.expression)
+
+        # distinguish loads and stores
+        for key in subs_expr.count_map:
+            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                stride=key.stride, direction='load',
+                                variable=key.variable)
+                     ] = subs_expr.pop(key)
+
+        subs_assignee_g = subs_counter_g(insn.assignee)
+        for key in subs_assignee_g.count_map:
+            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                      stride=key.stride,
+                                      direction='store',
+                                      variable=key.variable)
+                           ] = subs_assignee_g.pop(key)
+        # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.dict:
-            poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+        for key in subs_expr.count_map:
+            map = ToCountMap({key: subs_expr[key]})
+            if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
+                subs_map = subs_map \
+                            + map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-        for key in subs_assignee.dict:
-            poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
+                #currently not counting stride of local mem access
+
+        for key in subs_assignee_g.count_map:
+            map = ToCountMap({key: subs_assignee_g[key]})
+            if isinstance(key.stride, int) and key.stride == 0:
+                subs_map = subs_map \
+                            + map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-
-    result = subs_poly.dict
+                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
+            # for now, don't count writes to local mem
 
     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind, direction), count)
-                for (dtype, kind, direction), count in six.iteritems(result))
-
-    return result
+        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+                                             dtype=mem_access.dtype.numpy_dtype,
+                                             stride=mem_access.stride,
+                                             direction=mem_access.direction,
+                                             variable=mem_access.variable)
+                                   , count)
+                      for mem_access, count in six.iteritems(subs_map.count_map))
 
-
-def get_DRAM_access_poly(knl):
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
-            DeprecationWarning, stacklevel=2)
-    return get_gmem_access_poly(knl)
-
-# }}}
+    return subs_map
 
 
-# {{{ sum_mem_access_to_bytes
+# {{{ get_synchronization_poly
 
-def sum_mem_access_to_bytes(m):
-    """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
 
-    **{(** :class:`string` **,** :class:`string` **)**
-    **:** :class:`islpy.PwQPolynomial` **}**
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
 
-    i.e., aggregate the transfer numbers for all types into a single byte count.
     """
-
-    result = {}
-    for (dtype, kind, direction), v in m.items():
-        new_key = (kind, direction)
-        bytes_transferred = int(dtype.itemsize) * v
-        if new_key in result:
-            result[new_key] += bytes_transferred
-        else:
-            result[new_key] = bytes_transferred
-
-    return result
+    warn_with_kernel(knl, "depricated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
 
 # }}}
 
 
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
+def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
@@ -804,8 +1296,8 @@ def get_synchronization_poly(knl):
     :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :return: A dictionary mapping each type of synchronization event to a
-            :class:`islpy.PwQPolynomial` holding the number of such events
-            per thread.
+            :class:`islpy.PwQPolynomial` holding the number of events per
+            thread.
 
             Possible keys include ``barrier_local``, ``barrier_global``
             (if supported by the target) and ``kernel_launch``.
@@ -814,9 +1306,9 @@ def get_synchronization_poly(knl):
 
         # (first create loopy kernel and specify array data types)
 
-        barrier_poly = get_barrier_poly(knl)
+        sync_map = get_synchronization_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = barrier_poly.eval_with_dict(params)
+        barrier_ct = sync_map['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
@@ -854,8 +1346,8 @@ def get_synchronization_poly(knl):
                 iname_list.pop()
 
         elif isinstance(sched_item, Barrier):
-            result = result + ToCountMap(
-                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+            result = result + ToCountMap({"barrier_%s" % sched_item.kind:
+                                          get_count_poly(iname_list)})
 
         elif isinstance(sched_item, CallKernel):
             result = result + ToCountMap(
@@ -868,9 +1360,8 @@ def get_synchronization_poly(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    return result.dict
-
-# }}}
+    # TODO: is returning the map itself (not result.count_map) okay?
+    return result
 
 
 # {{{ gather_access_footprints
@@ -881,7 +1372,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
     of each the array *var_name* are read/written (where
     *direction* is either ``read`` or ``write``.
 
-    :arg ignore_uncountable: If *True*, an error will be raised for
+    :arg ignore_uncountable: If *False*, an error will be raised for
         accesses on which the footprint cannot be determined (e.g.
         data-dependent or nonlinear indices)
     """
@@ -905,7 +1396,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
         insn_inames = kernel.insn_inames(insn)
         inames_domain = kernel.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(insn_inames,
+                                                   [dim_type.set]))
 
         afg = AccessFootprintGatherer(kernel, domain,
                 ignore_uncountable=ignore_uncountable)
@@ -947,7 +1439,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         kernel = preprocess_kernel(kernel)
 
     result = {}
-    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
+    fp = gather_access_footprints(kernel,
+                                  ignore_uncountable=ignore_uncountable)
 
     for key, var_fp in fp.items():
         vname, direction = key
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 0c304b7a854579007f57ba204cbff8f440aaf5fc..c85aa80ec92eb0185d30f96b478ae37043c0d7e0 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
 
     if 1:
         print("OPS")
-        op_poly = lp.get_op_poly(hsv)
-        print(lp.stringify_stats_mapping(op_poly))
+        op_map = lp.get_op_map(hsv)
+        print(lp.stringify_stats_mapping(op_map))
 
         print("MEM")
-        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
-        print(lp.stringify_stats_mapping(gmem_poly))
+        gmem_map = lp.get_mem_access_map(hsv).to_bytes()
+        print(lp.stringify_stats_mapping(gmem_map))
 
     hsv = lp.set_options(hsv, cl_build_options=[
          "-cl-denorms-are-zero",
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 68be5b8a260858e058619c796b3836611c8d4f0f..fb502045c7b6b2c7e02d11ad3ebda3b5d13c8bda 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,8 +28,10 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
+from loopy.types import to_loopy_type
 import numpy as np
 
+from pymbolic.primitives import Variable
 
 def test_op_counter_basic():
 
@@ -44,21 +46,22 @@ def test_op_counter_basic():
             name="basic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
-
+    
 
 def test_op_counter_reduction():
 
@@ -70,15 +73,19 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    assert f32 == f32add + f32mul
+
 
 def test_op_counter_logic():
 
@@ -92,15 +99,15 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -120,24 +127,25 @@ def test_op_counter_specialops():
             name="specialops", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
-    assert f64pow == i32add == f64rsqrt == f64sin == n*m
+    assert f64pow == i32add == f64rsq == f64sin == n*m
 
 
 def test_op_counter_bitwise():
@@ -157,17 +165,17 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
-    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -196,9 +204,9 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
-    flops = poly.eval_with_dict(value_dict)
+    flops = op_map.eval_with_dict(value_dict)
 
     if expect_fallback:
         assert flops == 144
@@ -206,7 +214,7 @@ def test_op_counter_triangular_domain():
         assert flops == 78
 
 
-def test_gmem_access_counter_basic():
+def test_mem_access_counter_basic():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -220,31 +228,37 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                   ].eval_with_dict(params)
-    assert f32 == 3*n*m*l
-    assert f64 == 2*n*m
-
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                   ].eval_with_dict(params)
-    assert f32 == n*m*l
-    assert f64 == n*m
-
-
-def test_gmem_access_counter_reduction():
+    f32l = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32l += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64l = mem_map[lp.MemAccess('global', np.float64,
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64l += mem_map[lp.MemAccess('global', np.float64,
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
+    assert f32l == 3*n*m*l
+    assert f64l == 2*n*m
+
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
+    assert f32s == n*m*l
+    assert f64s == n*m
+
+
+def test_mem_access_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -254,23 +268,33 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    assert f32 == 2*n*m*l
+    f32l = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32l += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    assert f32l == 2*n*m*l
+
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    assert f32s == n*l
 
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    assert f32 == n*l
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*f32l
+    assert st_bytes == 4*f32s
 
 
-def test_gmem_access_counter_logic():
+def test_mem_access_counter_logic():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -282,27 +306,29 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    assert f32 == 2*n*m
-    assert f64 == n*m
 
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    assert f64 == n*m
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='store')
+                         ].eval_with_dict(params)
+    assert f32_g_l == 2*n*m
+    assert f64_g_l == n*m
+    assert f64_g_s == n*m
 
 
-def test_gmem_access_counter_specialops():
+def test_mem_access_counter_specialops():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -314,33 +340,43 @@ def test_gmem_access_counter_specialops():
             ],
             name="specialops", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
+    f32 = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32 += mem_map[lp.MemAccess('global', np.float32,
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
+    f32 = mem_map[lp.MemAccess('global', np.float32,
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64 = mem_map[lp.MemAccess('global', np.float64,
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
+    filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g'])
+    #tot = lp.eval_and_sum_polys(filtered_map, params)
+    tot = filtered_map.eval_and_sum(params)
+    assert tot == n*m*l + n*m
 
-def test_gmem_access_counter_bitwise():
+def test_mem_access_counter_bitwise():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -357,23 +393,35 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'load')
-                    ].eval_with_dict(params)
+    i32 = mem_map[lp.MemAccess('global', np.int32, 
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
+                          stride=0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
+                          stride=0, direction='load', variable='g')
+               ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), 
+                          stride=0, direction='load', variable='h')
+               ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'store')
-                    ].eval_with_dict(params)
+    i32 = mem_map[lp.MemAccess('global', np.int32, 
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
+                          stride=0, direction='store', variable='e')
+               ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
 
-def test_gmem_access_counter_mixed():
+def test_mem_access_counter_mixed():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -391,35 +439,44 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)  # noqa
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f32uniform = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
+    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
+                                stride=0, direction='load', variable='g')
+                     ].eval_with_dict(params)
+    f64uniform += mem_map[lp.MemAccess('global', np.float64, 
+                                 stride=0, direction='load', variable='h')
+                      ].eval_with_dict(params)
+    f32uniform = mem_map[lp.MemAccess('global', np.float32, 
+                                stride=0, direction='load', variable='x')
+                     ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m'), direction='load',
+                                  variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+                                   stride=Variable('m'), direction='load',
+                                   variable='b')
+                        ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
+    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
+                                stride=0, direction='store', variable='e')
+                     ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
+                                  stride=Variable('m'), direction='store',
+                                  variable='c')
+                       ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_nonconsec():
+def test_mem_access_counter_nonconsec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -435,31 +492,43 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)  # noqa
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict(params)
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
+                                  stride=Variable('m'), direction='load',
+                                  variable='g')
+                       ].eval_with_dict(params)
+    f64nonconsec += mem_map[lp.MemAccess('global', np.float64, 
+                                   stride=Variable('m'), direction='load',
+                                   variable='h')
+                        ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='load', variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+                                   stride=Variable('m')*Variable('l'),
+                                   direction='load', variable='b')
+                        ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict(params)
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
+                                  stride=Variable('m'), direction='store',
+                                  variable='e')
+                       ].eval_with_dict(params)
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='store', variable='c')
+                       ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_consec():
+def test_mem_access_counter_consec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -474,27 +543,36 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = lp.get_gmem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'load')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'load')
-                    ].eval_with_dict(params)
+    #for k in mem_map:
+    #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k])
+
+    f64consec = mem_map[lp.MemAccess('global', np.float64, 
+                        stride=1, direction='load', variable='g')
+                     ].eval_with_dict(params)
+    f64consec += mem_map[lp.MemAccess('global', np.float64, 
+                        stride=1, direction='load', variable='h')
+                     ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess('global', np.float32, 
+                        stride=1, direction='load', variable='a')
+                     ].eval_with_dict(params)
+    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='load', variable='b')
+                     ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'store')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'store')
-                    ].eval_with_dict(params)
+    f64consec = mem_map[lp.MemAccess('global', np.float64, 
+                        stride=1, direction='store', variable='e')
+                     ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess('global', np.float32, 
+                        stride=1, direction='store', variable='c')
+                     ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
 
@@ -511,15 +589,15 @@ def test_barrier_counter_nobarriers():
             ],
             name="basic", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    sync_poly = lp.get_synchronization_poly(knl)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
+    sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    assert len(sync_map) == 1
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
 
 
 def test_barrier_counter_barriers():
@@ -539,13 +617,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
-    poly = lp.get_synchronization_poly(knl)
-    print(poly)
+    sync_map = lp.get_synchronization_map(knl)
+    print(sync_map)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly["barrier_local"].eval_with_dict(params)
+    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
 
@@ -560,50 +638,58 @@ def test_all_counters_parallel_matmul():
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    sync_poly = lp.get_synchronization_poly(knl)
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    sync_map = lp.get_synchronization_map(knl)
+    assert len(sync_map) == 2
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
 
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     f32mul = op_map[
-                        (np.dtype(np.float32), 'mul')
+                        lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        (np.dtype(np.float32), 'add')
+                        lp.Op(np.float32, 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        (np.dtype(np.int32), 'add')
+                        lp.Op(np.int32, 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        (np.dtype(np.int32), 'mul')
+                        lp.Op(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
-    assert i32ops == n*m*l*4 + l*n*4
 
-    subscript_map = lp.get_gmem_access_poly(knl)
-    f32uncoal = subscript_map[
-                        (np.dtype(np.float32), 'nonconsecutive', 'load')
-                        ].eval_with_dict(params)
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'load')
-                        ].eval_with_dict(params)
+    op_map = lp.get_mem_access_map(knl)
 
-    assert f32uncoal == n*m*l
-    assert f32coal == n*m*l
+    f32coal = op_map[lp.MemAccess('global', np.float32, 
+                        stride=1, direction='load', variable='b')
+                            ].eval_with_dict(params)
+    f32coal += op_map[lp.MemAccess('global', np.float32, 
+                        stride=1, direction='load', variable='a')
+                            ].eval_with_dict(params)
 
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'store')
-                        ].eval_with_dict(params)
+    assert f32coal == n*m+m*l
+
+    f32coal = op_map[lp.MemAccess('global', np.float32, 
+                        stride=1, direction='store', variable='c')
+                            ].eval_with_dict(params)
 
     assert f32coal == n*l
 
+    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
+    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
+                                            direction='load')
+                                 ].eval_with_dict(params)
+    assert local_mem_l == n*m*l*2
 
 def test_gather_access_footprint():
     knl = lp.make_kernel(
@@ -637,6 +723,82 @@ def test_gather_access_footprint_2():
         print(key, count(knl, footprint))
 
 
+def test_summations_and_filters():
+    # Exercises filtering/grouping/summing on the mem-access and op maps.
+    knl = lp.make_kernel(
+            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            [
+                """
+                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+                e[i, k+1] = -g[i,k]*h[i,k+1]
+                """
+            ],
+            name="basic", assumptions="n,m,l >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    n = 512
+    m = 256
+    l = 128
+    params = {'n': n, 'm': m, 'l': l}
+
+    mem_map = lp.get_mem_access_map(knl)
+    # 'a' is read twice per (i,j,k) point by the first statement
+    loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params)
+    assert loads_a == 2*n*m*l
+    # one store to c per (i,j,k) point and one store to e per (i,k) point
+    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params)
+    assert global_stores == n*m*l + n*m
+    # expected byte totals: 4 bytes per float32 access, 8 per float64 access
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
+    assert st_bytes == 4*n*m*l + 8*n*m
+
+    # ignore stride and variable names in this map
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
+                         ].eval_with_dict(params)
+    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
+                         ].eval_with_dict(params)
+    assert f32lall == 3*n*m*l
+    assert f64lall == 2*n*m
+
+    op_map = lp.get_op_map(knl)
+    # first statement: one float32 mul, div and add per (i,j,k) point;
+    # int32 ops presumably come from the k+1 subscripts -- TODO confirm
+
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
+    assert f32 == n*m*l*3
+    assert f64 == n*m
+    assert i32 == n*m*2
+
+    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
+    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
+    assert addsub_all == n*m*l + n*m*2
+    assert f32ops_all == n*m*l*3
+
+    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
+    assert non_field == 0
+
+    ops_nodtype = op_map.group_by('name')
+    ops_noname = op_map.group_by('dtype')
+    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
+    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    assert mul_all == n*m*l + n*m
+    assert f64ops_all == n*m
+
+    def func_filter(key):
+        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
+               key.direction == 'load'
+    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
+    assert s1f64l == 2*n*m
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])