diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 87daa9fc4fc01b0625066cfd7c934c046b546930..c633e55dec69ee5fad981ec1812c93ba5349ab65 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -176,7 +176,7 @@ by passing :attr:`loopy.Options.write_cl`. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -250,7 +250,7 @@ call :func:`loopy.generate_code`: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -365,7 +365,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -414,7 +414,7 @@ Now the intended code is generated and our test passes. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -557,12 +557,14 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... + #define gid(N) ((int) get_group_id(N)) + <BLANKLINE> + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict a, int const n) + { for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; - ... + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; + } Notice how loopy has automatically generated guard conditionals to make sure the bounds on the old iname are obeyed. @@ -701,8 +703,9 @@ Let's try this out on our vector fill kernel by creating workgroups of size >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... 
- __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) + #define gid(N) ((int) get_group_id(N)) + <BLANKLINE> + __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[128 * gid(0) + lid(0)] = 0.0f; @@ -1182,7 +1185,7 @@ When we ask to see the code, the issue becomes apparent: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out) { float a_fetch[16]; <BLANKLINE> @@ -1207,26 +1210,30 @@ Obtaining Performance Statistics .. {{{ -Operations, array access, and barriers can all be counted, which may facilitate -performance prediction and optimization of a :mod:`loopy` kernel. +Arithmetic operations, array accesses, and synchronization operations can all +be counted, which may facilitate performance prediction and optimization of a +:mod:`loopy` kernel. .. note:: The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples - above, you may need to reset these before continuing: + above, you may want to reset these before continuing. We will temporarily + suppress warnings to keep the output clean: .. doctest:: - >>> from warnings import resetwarnings + >>> from warnings import resetwarnings, filterwarnings >>> resetwarnings() + >>> filterwarnings('ignore', category=Warning) Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_poly` provides information on the number and type of operations -being performed in a kernel.
To demonstrate this, we'll create an example kernel -that performs several operations on arrays containing different types of data: +:func:`loopy.get_op_map` provides information on the number and type of +arithmetic operations being performed in a kernel. To demonstrate this, we'll +create an example kernel that performs several operations on arrays containing +different types of data: .. doctest:: @@ -1244,37 +1251,36 @@ information provided. Now we will count the operations: .. doctest:: - >>> from loopy.statistics import get_op_poly - >>> op_map = get_op_poly(knl) -:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** -:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The -:class:`islpy.PwQPolynomial` holds the number of operations for the type specified -in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this -map now: +:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. The +:class:`islpy.PwQPolynomial` holds the number of operations for the kind of +operation specified in the key (in terms of the :class:`loopy.LoopKernel` +*inames*). We'll print this map now: ..
doctest:: >>> print(lp.stringify_stats_mapping(op_map)) - (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } <BLANKLINE> -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict) - >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict) - >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict) - >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict) - >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict) - >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1284,174 +1290,156 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: 65536 65536 -Counting array accesses -~~~~~~~~~~~~~~~~~~~~~~~ +Counting memory accesses +~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_gmem_access_poly` provides information on the number and type of -array loads and stores being performed in a kernel. To demonstrate this, we'll -continue using the kernel from the previous example: +:func:`loopy.get_mem_access_map` provides information on the number and +characteristics of memory accesses performed in a kernel. To demonstrate this, +we'll continue using the kernel from the previous example: .. 
doctest:: - >>> from loopy.statistics import get_gmem_access_poly - >>> load_store_map = get_gmem_access_poly(knl) - >>> print(lp.stringify_stats_mapping(load_store_map)) - (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + >>> mem_map = lp.get_mem_access_map(knl) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } <BLANKLINE> -:func:`loopy.get_gmem_access_poly` returns a mapping of **{(** -:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** -**:** :class:`islpy.PwQPolynomial` **}**. +:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. +:class:`loopy.MemAccess` attributes include: -- The :class:`numpy.dtype` specifies the type of the data being accessed. +- mtype: A :class:`str` that specifies the memory type accessed as **global** + or **local** -- The first string in the map key specifies the DRAM access type as *consecutive*, - *nonconsecutive*, or *uniform*. 
*Consecutive* memory accesses occur when - consecutive threads access consecutive array elements in memory, *nonconsecutive* - accesses occur when consecutive threads access nonconsecutive array elements in - memory, and *uniform* accesses occur when consecutive threads access the *same* - element in memory. +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. -- The second string in the map key specifies the DRAM access type as a *load*, or a - *store*. +- stride: An :class:`int` that specifies stride of the memory access. A stride + of 0 indicates a uniform access (i.e. all threads access the same item). -- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the - characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` - *inames*). +- direction: A :class:`str` that specifies the direction of memory access as + **load** or **store**. + +- variable: A :class:`str` that specifies the variable name of the data + accessed. We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... 
(f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -Since we have not tagged any of the inames or parallelized the kernel across threads -(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers -the array accesses *uniform*. Now we'll parallelize the kernel and count the array -accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated -this time, so we'll print the mapping manually to make it more legible: +Since we have not tagged any of the inames or parallelized the kernel across +threads (which would have produced iname tags), :func:`loopy.get_mem_access_map` +considers the memory accesses *uniform*, so the *stride* of each access is 0. +Now we'll parallelize the kernel and count the array accesses again. The +resulting :class:`islpy.PwQPolynomial` will be more complicated this time. .. doctest:: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_gmem_access_poly(knl_consec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'consecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float32'), 'consecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float64'), 'consecutive', 'load') : - [n, m, l] -> { ... 
} + >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } <BLANKLINE> - (dtype('float64'), 'consecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - With this parallelization, consecutive threads will access consecutive array elements in memory. The polynomials are a bit more complicated now due to the -parallelization, but when we evaluate them, we see that the total number of array -accesses has not changed: +parallelization, but when we evaluate them, we see that the total number of +array accesses has not changed: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') ... 
].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in -our parallelization of the kernel: +To produce *nonconsecutive* array accesses with stride greater than 1, we'll +switch the inner and outer tags in our parallelization of the kernel: .. doctest:: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_gmem_access_poly(knl_nonconsec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float32'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... 
} + >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } <BLANKLINE> - (dtype('float64'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float64'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - -With this parallelization, consecutive threads will access *nonconsecutive* array -elements in memory. The total number of array accesses has not changed: +With this parallelization, consecutive threads will access *nonconsecutive* +array elements in memory. The total number of array accesses still has not +changed: .. doctest:: - >>> f64ld = load_store_map[ - ... 
(np.dtype(np.float64), "nonconsecutive", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_synchronization_poly` counts the number of synchronization +:func:`loopy.get_synchronization_map` counts the number of synchronization events per **thread** in a kernel. First, we'll call this function on the kernel from the previous example: .. doctest:: - >>> from loopy.statistics import get_synchronization_poly - >>> barrier_poly = get_synchronization_poly(knl) - >>> print(lp.stringify_stats_mapping(barrier_poly)) + >>> sync_map = lp.get_synchronization_map(knl) + >>> print(lp.stringify_stats_mapping(sync_map)) kernel_launch : { 1 } <BLANKLINE> @@ -1459,7 +1447,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. 
doctest:: - >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1485,7 +1473,7 @@ Now to make things more interesting, we'll create a kernel with barriers: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e) + __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e) { __local int c[50 * 10 * 99]; <BLANKLINE> @@ -1499,24 +1487,24 @@ Now to make things more interesting, we'll create a kernel with barriers: } } - -In this kernel, when a thread performs the second instruction it uses data produced -by *different* threads during the first instruction. Because of this, barriers are -required for correct execution, so loopy inserts them. Now we'll count the barriers -using :func:`loopy.get_barrier_poly`: +In this kernel, when a thread performs the second instruction it uses data +produced by *different* threads during the first instruction. Because of this, +barriers are required for correct execution, so loopy inserts them. Now we'll +count the barriers using :func:`loopy.get_synchronization_map`: .. doctest:: - >>> sync_map = lp.get_synchronization_poly(knl) + >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) barrier_local : { 1000 } kernel_launch : { 1 } <BLANKLINE> -Based on the kernel code printed above, we would expect each thread to encounter -50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In -this case, the number of barriers does not depend on any inames, so we can pass an -empty dictionary to :func:`islpy.eval_with_dict`. 
+Based on the kernel code printed above, we would expect each thread to +encounter 50x10x2 barriers, which matches the result from +:func:`loopy.get_synchronization_map`. In this case, the number of barriers +does not depend on any inames, so we can pass an empty dictionary to +:func:`islpy.eval_with_dict`. .. }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index ec10722e1d9ec63a8ab2b6e785a04bec9b4f4c6a..468a274d76b8dcf133f30ba3af7119098fd330df 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1043,22 +1043,22 @@ def count(kernel, set): if not (is_subset and is_superset): if is_subset: - from loopy.diagnostic import warn - warn(kernel, "count_overestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_overestimate", "Barvinok wrappers are not installed. " "Counting routines have overestimated the " "number of integer points in your loop " "domain.") elif is_superset: - from loopy.diagnostic import warn - warn(kernel, "count_underestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_underestimate", "Barvinok wrappers are not installed. " "Counting routines have underestimated the " "number of integer points in your loop " "domain.") else: - from loopy.diagnostic import warn - warn(kernel, "count_misestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_misestimate", "Barvinok wrappers are not installed. " "Counting routines have misestimated the " "number of integer points in your loop "