diff --git a/doc/reference.rst b/doc/reference.rst index c7435bbf7f84570b6672ba1bf42eeab9d495ea56..eb10788cb07d8fb87cee5ce6edc90e11bb9db5c3 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -514,4 +514,13 @@ Controlling caching .. autoclass:: CacheMode +Obtaining Kernel Statistics +--------------------------- + +.. autofunction:: get_op_poly + +.. autofunction:: get_DRAM_access_poly + +.. autofunction:: get_barrier_poly + .. vim: tw=75:spell diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3180b5066823481c01b43f659ec851e3951fa5e4..4b4ce410948c3667c5798ecebd2081f837f6845c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -835,7 +835,7 @@ The loopy kernels we have seen thus far have consisted only of assignments from one global-memory storage location to another. Sometimes, computation results obviously get reused, so that recomputing them or even just re-fetching them from global memory becomes uneconomical. Loopy has -a number of different ways of adressing this need. +a number of different ways of addressing this need. Explicit private temporaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1183,6 +1183,307 @@ across the remaining axis of the workgroup would emerge. TODO +Gathering kernel statistics +--------------------------- + +Operations, array access, and barriers can all be counted, which may facilitate +performance prediction and optimization of a :mod:`loopy` kernel. + +.. note:: + + The functions used in the following examples may produce warnings. If you have + already made the filterwarnings and catch_warnings calls used in the examples + above, you may need to reset these before continuing: + + .. doctest:: + + >>> from warnings import resetwarnings + >>> resetwarnings() + +Counting operations +~~~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_op_poly` provides information on the number and type of operations +being performed in a kernel. To demonstrate this, we'll create an example kernel +that performs several operations on arrays containing different types of data: + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + ... """ + ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] + ... e[i, k] = g[i,k]*(2+h[i,k+1]) + ... """) + >>> knl = lp.add_and_infer_dtypes(knl, + ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + +Note that loopy will infer the data types for arrays c and e from the +information provided. Now we will count the operations: + +.. doctest:: + + >>> from loopy.statistics import get_op_poly + >>> op_map = get_op_poly(knl) + +:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** +:class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the +number of operations for the :class:`numpy.dtype` specified in the key (in terms of +the :class:`loopy.LoopKernel` *inames*). We'll print this map now: + +.. doctest:: + + >>> print(op_map) + float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } + int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } + <BLANKLINE> + +We can evaluate these polynomials using :func:`islpy.eval_with_dict`: + +.. doctest:: + + >>> param_dict = {'n': 256, 'm': 256, 'l': 8} + >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict) + >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict) + >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict) + >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % + ... (i32ops, f32ops, f64ops)) + integer ops: 65536 + float32 ops: 1572864 + float64 ops: 131072 + +Counting array accesses +~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_DRAM_access_poly` provides information on the number and type of +array loads and stores being performed in a kernel. To demonstrate this, we'll +continue using the kernel from the previous example: + +.. doctest:: + + >>> from loopy.statistics import get_DRAM_access_poly + >>> load_store_map = get_DRAM_access_poly(knl) + >>> print(load_store_map) + (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } + <BLANKLINE> + +:func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** +:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** +**:** :class:`islpy.PwQPolynomial` **}**. + +- The :class:`numpy.dtype` specifies the type of the data being accessed. + +- The first string in the map key specifies the DRAM access type as *consecutive*, + *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when + consecutive threads access consecutive array elements in memory, *nonconsecutive* + accesses occur when consecutive threads access nonconsecutive array elements in + memory, and *uniform* accesses occur when consecutive threads access the *same* + element in memory. + +- The second string in the map key specifies the DRAM access type as a *load*, or a + *store*. + +- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the + characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` + *inames*). + +We can evaluate these polynomials using :func:`islpy.eval_with_dict`: + +.. doctest:: + + >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "uniform", "load") + ... ].eval_with_dict(param_dict) + >>> f64st = load_store_map.dict[(np.dtype(np.float64), "uniform", "store") + ... ].eval_with_dict(param_dict) + >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "uniform", "load") + ... ].eval_with_dict(param_dict) + >>> f32st = load_store_map.dict[(np.dtype(np.float32), "uniform", "store") + ... ].eval_with_dict(param_dict) + >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % + ... (f32ld, f32st, f64ld, f64st)) + f32 load: 1572864 + f32 store: 524288 + f64 load: 131072 + f64 store: 65536 + +~~~~~~~~~~~ + +Since we have not tagged any of the inames or parallelized the kernel across threads +(which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers +the array accesses *uniform*. Now we'll parallelize the kernel and count the array +accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated +this time, so we'll print the mapping manually to make it more legible: + +.. doctest:: + + >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") + >>> load_store_map = get_DRAM_access_poly(knl_consec) + >>> for key in load_store_map.dict.keys(): + ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) + (dtype('float32'), 'consecutive', 'load') : + [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float64'), 'consecutive', 'store') : + [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float64'), 'consecutive', 'load') : + [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float32'), 'consecutive', 'store') : + [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + +With this parallelization, consecutive threads will access consecutive array +elements in memory. The polynomials are a bit more complicated now due to the +parallelization, but when we evaluate them, we see that the total number of array +accesses has not changed: + +.. doctest:: + + >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "consecutive", "load") + ... ].eval_with_dict(param_dict) + >>> f64st = load_store_map.dict[(np.dtype(np.float64), "consecutive", "store") + ... ].eval_with_dict(param_dict) + >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "consecutive", "load") + ... ].eval_with_dict(param_dict) + >>> f32st = load_store_map.dict[(np.dtype(np.float32), "consecutive", "store") + ... ].eval_with_dict(param_dict) + >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % + ... (f32ld, f32st, f64ld, f64st)) + f32 load: 1572864 + f32 store: 524288 + f64 load: 131072 + f64 store: 65536 + +~~~~~~~~~~~ + +To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in +our parallelization of the kernel: + +.. doctest:: + + >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") + >>> load_store_map = get_DRAM_access_poly(knl_nonconsec) + >>> for key in load_store_map.dict.keys(): + ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) + (dtype('float32'), 'nonconsecutive', 'store') : + [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float64'), 'nonconsecutive', 'load') : + [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float64'), 'nonconsecutive', 'store') : + [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + (dtype('float32'), 'nonconsecutive', 'load') : + [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + <BLANKLINE> + +With this parallelization, consecutive threads will access *nonconsecutive* array +elements in memory. The total number of array accesses has not changed: + +.. doctest:: + + >>> f64ld = load_store_map.dict[ + ... (np.dtype(np.float64), "nonconsecutive", "load") + ... ].eval_with_dict(param_dict) + >>> f64st = load_store_map.dict[ + ... (np.dtype(np.float64), "nonconsecutive", "store") + ... ].eval_with_dict(param_dict) + >>> f32ld = load_store_map.dict[ + ... (np.dtype(np.float32), "nonconsecutive", "load") + ... ].eval_with_dict(param_dict) + >>> f32st = load_store_map.dict[ + ... (np.dtype(np.float32), "nonconsecutive", "store") + ... ].eval_with_dict(param_dict) + >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % + ... (f32ld, f32st, f64ld, f64st)) + f32 load: 1572864 + f32 store: 524288 + f64 load: 131072 + f64 store: 65536 + +Counting barriers +~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_barrier_poly` counts the number of barriers per **thread** in a +kernel. First, we'll call this function on the kernel from the previous example: + +.. doctest:: + + >>> from loopy.statistics import get_barrier_poly + >>> barrier_poly = get_barrier_poly(knl) + >>> print("Barrier polynomial: %s" % barrier_poly) + Barrier polynomial: { 0 } + +We can evaluate this polynomial using :func:`islpy.eval_with_dict`: + +.. doctest:: + + >>> barrier_count = barrier_poly.eval_with_dict(param_dict) + >>> print("Barrier count: %s" % barrier_count) + Barrier count: 0 + +Now to make things more interesting, we'll create a kernel with barriers: + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", + ... [ + ... """ + ... c[i,j,k] = 2*a[i,j,k] + ... e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] + ... """ + ... ], [ + ... lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)), + ... "..." + ... ]) + >>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) + >>> knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") + >>> code, _ = lp.generate_code(lp.preprocess_kernel(knl)) + >>> print(code) + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + <BLANKLINE> + __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e) + { + __local int c[50 * 10 * 99]; + <BLANKLINE> + for (int i = 0; i <= 49; ++i) + for (int j = 0; j <= 9; ++j) + { + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1 + gid(0) * 128] = 2 * a[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128] = c[990 * i + 99 * j + 1 + lid(0) + 1 + gid(0) * 128] + c[990 * i + 99 * j + -1 + lid(0) + 1 + gid(0) * 128]; + } + } + + +In this kernel, when a thread performs the second instruction it uses data produced +by *different* threads during the first instruction. For correct execution barriers +are required, so loopy inserts them. Now we'll count the barriers using +:func:`loopy.get_barrier_poly`: + +.. doctest:: + + >>> barrier_poly = get_barrier_poly(knl) + >>> barrier_count = barrier_poly.eval_with_dict({}) + >>> print("Barrier polynomial: %s\nBarrier count: %i" % + ... (barrier_poly, barrier_count)) + Barrier polynomial: { 1000 } + Barrier count: 1000 + +Based on the kernel code printed above, we would expect each thread to encounter +50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In +this case, the number of barriers does not depend on any inames, so we can pass an +empty dictionary to :func:`islpy.eval_with_dict`. + .. }}} .. vim: tw=75:spell:foldmethod=marker diff --git a/loopy/__init__.py b/loopy/__init__.py index c63aa5f90d6537496ea9fe4ecb11c441070c91b4..f6f611f6538911678ecdd12578ca9ab17d0657f2 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -63,6 +63,7 @@ from loopy.padding import (split_arg_axis, find_padding_multiple, from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel +from loopy.statistics import get_op_poly, get_DRAM_access_poly, get_barrier_poly from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -102,6 +103,8 @@ __all__ = [ "generate_loop_schedules", "get_one_scheduled_kernel", "generate_code", "generate_body", + "get_op_poly", "get_DRAM_access_poly", "get_barrier_poly", + "CompiledKernel", "auto_test_vs_ref", diff --git a/loopy/statistics.py b/loopy/statistics.py index 82f36a659d06404f19251701e5b21c55098ec530..041720153726792026668260f9b77de59c6e1a45 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -76,7 +76,10 @@ class ToCountMap: return isl.PwQPolynomial('{ 0 }') def __str__(self): - return str(self.dict) + result = "" + for key in self.dict.keys(): + result += ("%s : %s\n" % (key, self.dict[key])) + return result def __repr__(self): return repr(self.dict) @@ -403,8 +406,32 @@ def count(kernel, bset): return result -# to evaluate poly: poly.eval_with_dict(dictionary) def get_op_poly(knl): + + """Count the number of operations in a loopy kernel. + + :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. + + :return: A mapping of **{** :class:`numpy.dtype` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`islpy.PwQPolynomial` holds the number of operations for + the :class:`numpy.dtype` specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). + + Example usage:: + + # (first create loopy kernel and specify array data types) + + poly = get_op_poly(knl) + params = {'n': 512, 'm': 256, 'l': 128} + float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(params) + float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + from loopy.preprocess import preprocess_kernel, infer_unknown_types knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -423,6 +450,49 @@ def get_op_poly(knl): def get_DRAM_access_poly(knl): # for now just counting subscripts + + """Count the number of DRAM accesses in a loopy kernel. + + :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be + counted. + + :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** + :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. + + - The :class:`numpy.dtype` specifies the type of the data being + accessed. + + - The first string in the map key specifies the DRAM access type as + *consecutive*, *nonconsecutive*, or *uniform*. + + - The second string in the map key specifies the DRAM access type as a + *load*, or a *store*. + + - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). + + Example usage:: + + # (first create loopy kernel and specify array data types) + + subscript_map = get_DRAM_access_poly(knl) + params = {'n': 512, 'm': 256, 'l': 128} + + f32_uncoalesced_load = subscript_map.dict[ + (np.dtype(np.float32), 'nonconsecutive', 'load') + ].eval_with_dict(params) + f32_coalesced_load = subscript_map.dict[ + (np.dtype(np.float32), 'consecutive', 'load') + ].eval_with_dict(params) + f32_coalesced_store = subscript_map.dict[ + (np.dtype(np.float32), 'consecutive', 'store') + ].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + from loopy.preprocess import preprocess_kernel, infer_unknown_types knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -433,7 +503,6 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) - subs_expr = subscript_counter(insn.expression) subs_expr = ToCountMap(dict( (key + ("load",), val) @@ -449,6 +518,26 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts def get_barrier_poly(knl): + + """Count the number of barriers each thread encounters in a loopy kernel. + + :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + + :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls + made (in terms of the :class:`loopy.LoopKernel` *inames*). + + Example usage:: + + # (first create loopy kernel and specify array data types) + + barrier_poly = get_barrier_poly(knl) + params = {'n': 512, 'm': 256, 'l': 128} + barrier_count = barrier_poly.eval_with_dict(params) + + # (now use this count to predict performance) + + """ + from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import EnterLoop, LeaveLoop, Barrier from operator import mul diff --git a/test/test_statistics.py b/test/test_statistics.py index 80c9738d68791fc892e4939148939d3db509d637..87ed797e74fd709c29ad9d763e195ff46985ed96 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -415,6 +415,15 @@ def test_DRAM_access_counter_nonconsec(): assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l + f64nonconsec = poly.dict[ + (np.dtype(np.float64), 'nonconsecutive', 'store') + ].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32nonconsec = poly.dict[ + (np.dtype(np.float32), 'nonconsecutive', 'store') + ].eval_with_dict({'n': n, 'm': m, 'l': l}) + assert f64nonconsec == n*m + assert f32nonconsec == n*m*l + def test_DRAM_access_counter_consec(): @@ -435,7 +444,7 @@ def test_DRAM_access_counter_consec(): n = 512 m = 256 l = 128 - print(poly) + f64consec = poly.dict[ (np.dtype(np.float64), 'consecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) @@ -445,6 +454,15 @@ def test_DRAM_access_counter_consec(): assert f64consec == 2*n*m assert f32consec == 3*n*m*l + f64consec = poly.dict[ + (np.dtype(np.float64), 'consecutive', 'store') + ].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32consec = poly.dict[ + (np.dtype(np.float32), 'consecutive', 'store') + ].eval_with_dict({'n': n, 'm': m, 'l': l}) + assert f64consec == n*m + assert f32consec == n*m*l + def test_barrier_counter_nobarriers(): @@ -509,7 +527,7 @@ def test_all_counters_parallel_matmul(): m = 256 l = 128 - barrier_count = get_barrier_poly(knl).eval_with_dict({'n': n, 'm': n, 'l': n}) + barrier_count = get_barrier_poly(knl).eval_with_dict({'n': n, 'm': m, 'l': l}) assert barrier_count == 0 op_map = get_op_poly(knl) @@ -540,6 +558,7 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])