diff --git a/doc/reference.rst b/doc/reference.rst
index c7435bbf7f84570b6672ba1bf42eeab9d495ea56..eb10788cb07d8fb87cee5ce6edc90e11bb9db5c3 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -514,4 +514,13 @@ Controlling caching
 
 .. autoclass:: CacheMode
 
+Obtaining Kernel Statistics
+---------------------------
+
+.. autofunction:: get_op_poly
+
+.. autofunction:: get_DRAM_access_poly
+
+.. autofunction:: get_barrier_poly
+
 .. vim: tw=75:spell
diff --git a/loopy/__init__.py b/loopy/__init__.py
index c63aa5f90d6537496ea9fe4ecb11c441070c91b4..f6f611f6538911678ecdd12578ca9ab17d0657f2 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -63,6 +63,7 @@ from loopy.padding import (split_arg_axis, find_padding_multiple,
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
+from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -102,6 +103,8 @@ __all__ = [
         "generate_loop_schedules", "get_one_scheduled_kernel",
         "generate_code", "generate_body",
 
+        "get_op_poly", "get_DRAM_access_poly", "get_barrier_poly",
+
         "CompiledKernel",
 
         "auto_test_vs_ref",
diff --git a/loopy/compiled.py b/loopy/compiled.py
index da659eaba5e3c7c8c99993946da1e7af5bb399bc..a29f357b08e4383c1e6ea1b9b79db8330e8498a0 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -781,10 +781,11 @@ class _CLKernelInfo(Record):
 class CompiledKernel:
     def __init__(self, context, kernel):
         """
-        :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
-            (a warning will be issued if more than one is returned). If the
-            kernel has not yet been loop-scheduled, that is done, too, with no
+        :arg kernel: may be a loopy.LoopKernel or a generator returning kernels \
+            (a warning will be issued if more than one is returned). If the \
+            kernel has not yet been loop-scheduled, that is done, too, with no \
             specific arguments.
+
         """
 
         self.context = context
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 85ac4c77bdad2573b6a7a033aa0e17221e010d59..5c58d66c8786840c24afa4d9575762b17b96a220 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -403,8 +403,32 @@ def count(kernel, bset):
     return result
 
 
-# to evaluate poly: poly.eval_with_dict(dictionary)
 def get_op_poly(knl):
+
+    """Count the number of operations in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+
+    :return: A mapping of **{** :class:`numpy.dtype` \
+             **:** :class:`islpy.PwQPolynomial` **}**.
+
+             - The :class:`islpy.PwQPolynomial` holds the number of operations for \
+             the :class:`numpy.dtype` specified in the key (in terms of \
+             the :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        poly = get_op_poly(knl)
+        n = 512
+        m = 256
+        l = 128
+        float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(
+                                            {'n': n, 'm': m, 'l': l})
+        float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(
+                                            {'n': n, 'm': m, 'l': l})
+
+    """
+
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
@@ -423,6 +447,44 @@ def get_op_poly(knl):
 
 
 def get_DRAM_access_poly(knl):  # for now just counting subscripts
+
+    """Count the number of DRAM accesses in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` \
+                    whose DRAM accesses are to be counted.
+
+    :return: A mapping of **{(** \
+             :class:`numpy.dtype` **,** :class:`str` **,** :class:`str` \
+             **)** **:** :class:`islpy.PwQPolynomial` **}**.
+
+             - The :class:`numpy.dtype` specifies \
+                the type of the data being accessed.
+
+             - The first string in the map key specifies the DRAM access type as \
+                *consecutive*, *nonconsecutive*, or *uniform*.
+
+             - The second string in the map key specifies the DRAM access type as \
+                a *load* or a *store*.
+
+             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses \
+                with the characteristics specified in the key (in terms of the \
+                :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        subscript_map = get_DRAM_access_poly(knl)
+        f32_uncoalesced_load = subscript_map.dict[
+                            (np.dtype(np.float32), 'nonconsecutive', 'load')
+                            ].eval_with_dict({'n': n, 'm': m, 'l': l})
+        f32_coalesced_load = subscript_map.dict[
+                            (np.dtype(np.float32), 'consecutive', 'load')
+                            ].eval_with_dict({'n': n, 'm': m, 'l': l})
+        f32_coalesced_store = subscript_map.dict[
+                            (np.dtype(np.float32), 'consecutive', 'store')
+                            ].eval_with_dict({'n': n, 'm': m, 'l': l})
+
+    """
+
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
@@ -448,6 +510,27 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
 
 
 def get_barrier_poly(knl):
+
+    """Count the number of barriers in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` \
+                    whose barriers are to be counted.
+
+    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls \
+             made (in terms of the :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        barrier_poly = get_barrier_poly(knl)
+
+        n = 512
+        m = 256
+        l = 128
+
+        barrier_count = barrier_poly.eval_with_dict({'n': n, 'm': m, 'l': l})
+
+    """
+
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     from loopy.schedule import EnterLoop, LeaveLoop, Barrier
     from operator import mul