diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 3180b5066823481c01b43f659ec851e3951fa5e4..71dad63e0c2ee28151aae8be24fd4e67e18cf106 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1183,6 +1183,94 @@ across the remaining axis of the workgroup would emerge.
 
 TODO
 
+Gathering kernel statistics
+---------------------------
+
+Operations, array access, and barriers can all be counted, which may facilitate performance prediction and optimization of a :mod:`loopy` kernel.
+
+.. note::
+
+    The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples above, you may need to reset these before continuing:
+
+    .. doctest::
+
+        >>> from warnings import resetwarnings
+        >>> resetwarnings()
+
+Counting operations
+~~~~~~~~~~~~~~~~~~~
+
+:func:`loopy.get_op_poly` provides information on the number and type of operations being performed in a kernel. To demonstrate this, we'll create an example kernel that performs several operations on arrays containing different types of data:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(
+    ...     "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+    ...     """
+    ...     c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+    ...     e[i, k] = g[i,k]*(2+h[i,k+1])
+    ...     """)
+    >>> knl = lp.add_and_infer_dtypes(knl,
+                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+
+Note that loopy will infer the data types for arrays c and e from the information provided. Now we will count the operations:
+
+.. doctest::
+
+    >>> op_map = get_op_poly(knl)
+
+:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** :class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the number of operations for the :class:`numpy.dtype` specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this map now:
+
+.. doctest::
+
+    >>> for key in op_map.dict.keys():
+    ...     print("%s : %s" % (key, op_map.dict[key]))
+    float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
+    int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
+
+We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
+
+.. doctest::
+
+    >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
+    >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict)
+    >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict)
+    >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict)
+    >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % (i32ops, f32ops, f64ops))
+    integer ops: 65536
+    float32 ops: 1572864
+    float64 ops: 131072
+
+Counting array accesses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`loopy.get_DRAM_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example.
+
+:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+
+- The :class:`numpy.dtype` specifies the type of the data being accessed.
+
+- The first string in the map key specifies the DRAM access type as *consecutive*, *nonconsecutive*, or *uniform*.
+
+- The second string in the map key specifies the DRAM access type as a *load*, or a *store*.
+
+- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*).
+
+We will call :func:`loopy.get_DRAM_access_poly` on our example kernel now:
+
+.. doctest::
+
+    >>> from loopy.statistics import get_DRAM_access_poly
+
+    >>> load_store_map = get_DRAM_access_poly(knl)
+    >>> for key in load_store_map.dict.keys():
+    ...     print("%s : %s" % (key, load_store_map.dict[key]))
+    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
+
 .. }}}
 
 .. vim: tw=75:spell:foldmethod=marker
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5c58d66c8786840c24afa4d9575762b17b96a220..e9fcdb2dc685cecea26e5df5deeda99bdc7b17af 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -418,15 +418,21 @@ def get_op_poly(knl):
 
     Example usage::
 
+        # first create loopy kernel and specify array data types
+
         poly = get_op_poly(knl)
+
         n = 512
         m = 256
         l = 128
+
         float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(
                                             {'n': n, 'm': m, 'l': l})
         float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(
                                             {'n': n, 'm': m, 'l': l})
 
+        # now use these counts to predict performance
+
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
@@ -472,7 +478,10 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
 
     Example usage::
 
+        # first create loopy kernel and specify array data types
+
         subscript_map = get_DRAM_access_poly(knl)
+
         f32_uncoalesced_load = subscript_map.dict[
                             (np.dtype(np.float32), 'nonconsecutive', 'load')
                             ].eval_with_dict({'n': n, 'm': m, 'l': l})
@@ -483,6 +492,8 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
                             (np.dtype(np.float32), 'consecutive', 'store')
                             ].eval_with_dict({'n': n, 'm': m, 'l': l})
 
+        # now use these counts to predict performance
+
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
@@ -521,6 +532,8 @@ def get_barrier_poly(knl):
 
     Example usage::
 
+        # first create loopy kernel and specify array data types
+
         barrier_poly = get_barrier_poly(knl)
 
         n = 512
@@ -529,6 +542,8 @@ def get_barrier_poly(knl):
 
         barrier_count = barrier_poly.eval_with_dict({'n': n, 'm': m, 'l': l})
 
+        # now use this count to predict performance
+
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 56cc22f8fcfc967062a8284f3510a3d25a2af63e..3981a3cb775cb0a218e3f03eefb3491e74087a73 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -558,6 +558,8 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
+    1/0
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])