diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3180b5066823481c01b43f659ec851e3951fa5e4..71dad63e0c2ee28151aae8be24fd4e67e18cf106 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1183,6 +1183,94 @@ across the remaining axis of the workgroup would emerge. TODO +Gathering kernel statistics +--------------------------- + +Operations, array access, and barriers can all be counted, which may facilitate performance prediction and optimization of a :mod:`loopy` kernel. + +.. note:: + + The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples above, you may need to reset these before continuing: + + .. doctest:: + + >>> from warnings import resetwarnings + >>> resetwarnings() + +Counting operations +~~~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_op_poly` provides information on the number and type of operations being performed in a kernel. To demonstrate this, we'll create an example kernel that performs several operations on arrays containing different types of data: + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + ... """ + ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] + ... e[i, k] = g[i,k]*(2+h[i,k+1]) + ... """) + >>> knl = lp.add_and_infer_dtypes(knl, + dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + +Note that loopy will infer the data types for arrays c and e from the information provided. Now we will count the operations: + +.. doctest:: + + >>> op_map = get_op_poly(knl) + +:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** :class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the number of operations for the :class:`numpy.dtype` specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this map now: + +.. doctest:: + + >>> for key in op_map.dict.keys(): + ... print("%s : %s" % (key, op_map.dict[key])) + float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } + int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } + +We can evaluate these polynomials using :func:`islpy.eval_with_dict`: + +.. doctest:: + + >>> param_dict = {'n': 256, 'm': 256, 'l': 8} + >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict) + >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict) + >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict) + >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % (i32ops, f32ops, f64ops)) + integer ops: 65536 + float32 ops: 1572864 + float64 ops: 131072 + +Counting array accesses +~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_DRAM_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example. + +:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. + +- The :class:`numpy.dtype` specifies the type of the data being accessed. + +- The first string in the map key specifies the DRAM access type as *consecutive*, *nonconsecutive*, or *uniform*. + +- The second string in the map key specifies the DRAM access type as a *load*, or a *store*. + +- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). + +We will call :func:`loopy.get_DRAM_access_poly` on our example kernel now: + +.. doctest:: + + >>> from loopy.statistics import get_DRAM_access_poly + + >>> load_store_map = get_DRAM_access_poly(knl) + >>> for key in load_store_map.dict.keys(): + ... print("%s : %s" % (key, load_store_map.dict[key])) + (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } + .. }}} .. vim: tw=75:spell:foldmethod=marker diff --git a/loopy/statistics.py b/loopy/statistics.py index 5c58d66c8786840c24afa4d9575762b17b96a220..e9fcdb2dc685cecea26e5df5deeda99bdc7b17af 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -418,15 +418,21 @@ def get_op_poly(knl): Example usage:: + # first create loopy kernel and specify array data types + poly = get_op_poly(knl) + n = 512 m = 256 l = 128 + float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict( {'n': n, 'm': m, 'l': l}) float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict( {'n': n, 'm': m, 'l': l}) + # now use these counts to predict performance + """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -472,7 +478,10 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts Example usage:: + # first create loopy kernel and specify array data types + subscript_map = get_DRAM_access_poly(knl) + f32_uncoalesced_load = subscript_map.dict[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) @@ -483,6 +492,8 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts (np.dtype(np.float32), 'consecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) + # now use these counts to predict performance + """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -521,6 +532,8 @@ def get_barrier_poly(knl): Example usage:: + # first create loopy kernel and specify array data types + barrier_poly = get_barrier_poly(knl) n = 512 @@ -529,6 +542,8 @@ def get_barrier_poly(knl): barrier_count = barrier_poly.eval_with_dict({'n': n, 'm': m, 'l': l}) + # now use this count to predict performance + """ from loopy.preprocess import preprocess_kernel, infer_unknown_types diff --git a/test/test_statistics.py b/test/test_statistics.py index 56cc22f8fcfc967062a8284f3510a3d25a2af63e..3981a3cb775cb0a218e3f03eefb3491e74087a73 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -558,6 +558,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l + 1/0 + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])