added docstrings to statistics functions, began statistics section in tutorial

a804307f · James Stevens · ffcdd8b0 · a804307f · a804307f · a804307f
Commit a804307f authored 9 years ago by James Stevens
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1183,6 +1183,94 @@ across the remaining axis of the workgroup would emerge.
 TODO
+Gathering kernel statistics
+---------------------------
+Operations, array accesses, and barriers can all be counted, which may facilitate performance prediction and optimization of a :mod:`loopy` kernel.
+.. note::
+    The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples above, you may need to reset these before continuing:
+    .. doctest::
+        >>> from warnings import resetwarnings
+        >>> resetwarnings()
+Counting operations
+~~~~~~~~~~~~~~~~~~~
+:func:`loopy.get_op_poly` provides information on the number and type of operations being performed in a kernel. To demonstrate this, we'll create an example kernel that performs several operations on arrays containing different types of data:
+.. doctest::
+    >>> knl = lp.make_kernel(
+    ...     "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+    ...     """
+    ...     c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+    ...     e[i, k] = g[i,k]*(2+h[i,k+1])
+    ...     """)
+    >>> knl = lp.add_and_infer_dtypes(knl,
+    ...     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+Note that loopy will infer the data types for arrays c and e from the information provided. Now we will count the operations:
+.. doctest::
+    >>> from loopy.statistics import get_op_poly
+    >>> op_map = get_op_poly(knl)
+:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** :class:`islpy.PwQPolynomial` **}**. Each :class:`islpy.PwQPolynomial` holds the number of operations for the :class:`numpy.dtype` specified in its key, expressed in terms of the :class:`loopy.LoopKernel` parameters that bound the *inames* (here ``n``, ``m``, and ``l``). We'll print this map now:
+.. doctest::
+    >>> for key in op_map.dict.keys():
+    ...     print("%s : %s" % (key, op_map.dict[key]))
+    float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
+    int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
+We can evaluate these polynomials for concrete parameter values using :meth:`islpy.PwQPolynomial.eval_with_dict`:
+.. doctest::
+    >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
+    >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict)
+    >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict)
+    >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict)
+    >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % (i32ops, f32ops, f64ops))
+    integer ops: 65536
+    float32 ops: 1572864
+    float64 ops: 131072
+Counting array accesses
+~~~~~~~~~~~~~~~~~~~~~~~
+:func:`loopy.get_DRAM_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example.
+:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`str` **,** :class:`str` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+- The :class:`numpy.dtype` specifies the type of the data being accessed.
+- The first string in the map key specifies the DRAM access type as *consecutive*, *nonconsecutive*, or *uniform*.
+- The second string in the map key specifies the DRAM access direction as a *load* or a *store*.
+- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the characteristics specified in the key, expressed in terms of the :class:`loopy.LoopKernel` parameters that bound the *inames* (``n``, ``m``, and ``l`` for the example kernel).
+We will call :func:`loopy.get_DRAM_access_poly` on our example kernel now:
+.. doctest::
+    >>> from loopy.statistics import get_DRAM_access_poly
+    >>> load_store_map = get_DRAM_access_poly(knl)
+    >>> for key in load_store_map.dict.keys():
+    ...     print("%s : %s" % (key, load_store_map.dict[key]))
+    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
 .. }}}
 .. vim: tw=75:spell:foldmethod=marker
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -418,15 +418,21 @@ def get_op_poly(knl):
    Example usage::
+        # first create loopy kernel and specify array data types
        poly = get_op_poly(knl)
        n = 512
        m = 256
        l = 128
        float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(
                                            {'n': n, 'm': m, 'l': l})
        float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(
                                            {'n': n, 'm': m, 'l': l})
+        # now use these counts to predict performance
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
@@ -472,7 +478,10 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
    Example usage::
+        # first create loopy kernel and specify array data types
        subscript_map = get_DRAM_access_poly(knl)
        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict({'n': n, 'm': m, 'l': l})
@@ -483,6 +492,8 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict({'n': n, 'm': m, 'l': l})
+        # now use these counts to predict performance
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
@@ -521,6 +532,8 @@ def get_barrier_poly(knl):
    Example usage::
+        # first create loopy kernel and specify array data types
        barrier_poly = get_barrier_poly(knl)
        n = 512
@@ -529,6 +542,8 @@ def get_barrier_poly(knl):
        barrier_count = barrier_poly.eval_with_dict({'n': n, 'm': m, 'l': l})
+        # now use this count to predict performance
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types

--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -558,6 +558,8 @@ def test_all_counters_parallel_matmul():
    assert f32coal == n*l
+    1/0
 if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])