From dd31cce5d48e11eec96ef27d5902139311dbe4af Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Mon, 20 Jul 2015 15:11:39 -0500
Subject: [PATCH] finished operation and subscript counting portions of tutorial

---
 doc/tutorial.rst | 103 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 93 insertions(+), 10 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 71dad63e0..5dcffbafb 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1245,31 +1245,114 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 Counting array accesses
 ~~~~~~~~~~~~~~~~~~~~~~~

-:func:`loopy.get_DRAM_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example.
+:func:`loopy.get_DRAM_access_poly` provides information on the number and type of array loads and stores being performed in a kernel. To demonstrate this, we'll continue using the kernel from the previous example:
+
+.. doctest::
+
+    >>> from loopy.statistics import get_DRAM_access_poly
+    >>> load_store_map = get_DRAM_access_poly(knl)
+    >>> for key in load_store_map.dict.keys():
+    ...     print("%s : %s" % (key, load_store_map.dict[key]))
+    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }

 :func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

 - The :class:`numpy.dtype` specifies the type of the data being accessed.

-- The first string in the map key specifies the DRAM access type as *consecutive*, *nonconsecutive*, or *uniform*.
+- The first string in the map key specifies the DRAM access type as *consecutive*, *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when consecutive threads access consecutive array elements in memory, *nonconsecutive* accesses occur when consecutive threads access nonconsecutive array elements in memory, and *uniform* accesses occur when consecutive threads access the *same* element in memory.

 - The second string in the map key specifies the DRAM access type as a *load*, or a *store*.

 - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*).

-We will call :func:`loopy.get_DRAM_access_poly` on our example kernel now:
+We can evaluate these polynomials using :func:`islpy.eval_with_dict`:

 .. doctest::

-    >>> from loopy.statistics import get_DRAM_access_poly
+    >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "uniform", "load")].eval_with_dict(param_dict)
+    >>> f64st = load_store_map.dict[(np.dtype(np.float64), "uniform", "store")].eval_with_dict(param_dict)
+    >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "uniform", "load")].eval_with_dict(param_dict)
+    >>> f32st = load_store_map.dict[(np.dtype(np.float32), "uniform", "store")].eval_with_dict(param_dict)
+    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % (f32ld, f32st, f64ld, f64st))
+    f32 load: 1572864
+    f32 store: 524288
+    f64 load: 131072
+    f64 store: 65536
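+
+Since the first entry of each key is a :class:`numpy.dtype`, these counts can also be turned into a rough estimate of the data volume moved. The following sketch (ordinary Python rather than a verified doctest, reusing ``param_dict`` from the previous section) weights each evaluated count by the item size in bytes of its data type::
+
+    bytes_loaded = 0
+    bytes_stored = 0
+    for key in load_store_map.dict.keys():
+        dtype, access_type, direction = key
+        # evaluate the PwQPolynomial for the chosen parameter values
+        count = load_store_map.dict[key].eval_with_dict(param_dict)
+        if direction == "load":
+            bytes_loaded += count * dtype.itemsize
+        else:  # direction == "store"
+            bytes_stored += count * dtype.itemsize
+
+With the counts printed above this amounts to 1572864*4 + 131072*8 = 7340032 bytes loaded and 524288*4 + 65536*8 = 2621440 bytes stored.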

-    >>> load_store_map = get_DRAM_access_poly(knl)
+~~~~~~~~~~~
+
+Since we have not tagged any of the inames or parallelized the kernel across threads (which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers the array accesses *uniform*. Now we'll parallelize the kernel and count the array accesses again:
+
+.. doctest::
+
+    >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
+    >>> load_store_map = get_DRAM_access_poly(knl_consec)
     >>> for key in load_store_map.dict.keys():
-    ...     print("%s : %s" % (key, load_store_map.dict[key]))
-    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
-    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
-    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
-    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
+    (dtype('float32'), 'consecutive', 'load') :
+    [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float64'), 'consecutive', 'store') :
+    [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float64'), 'consecutive', 'load') :
+    [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float32'), 'consecutive', 'store') :
+    [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+With this parallelization, consecutive threads will access consecutive array elements in memory. The polynomials are a bit more complicated now due to the parallelization, but when we evaluate them, we see that the total number of array accesses has not changed:
+
+.. doctest::
+
+    >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "consecutive", "load")].eval_with_dict(param_dict)
+    >>> f64st = load_store_map.dict[(np.dtype(np.float64), "consecutive", "store")].eval_with_dict(param_dict)
+    >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "consecutive", "load")].eval_with_dict(param_dict)
+    >>> f32st = load_store_map.dict[(np.dtype(np.float32), "consecutive", "store")].eval_with_dict(param_dict)
+    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % (f32ld, f32st, f64ld, f64st))
+    f32 load: 1572864
+    f32 store: 524288
+    f64 load: 131072
+    f64 store: 65536
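+
+The four lookups above are repeated verbatim for each variant of the kernel. A small helper function (purely illustrative, not part of :mod:`loopy`) can collect the evaluated counts for a given access type in one pass::
+
+    def count_accesses(poly_map, access_type, params):
+        # Map (dtype, 'load'/'store') to the evaluated access count for all
+        # entries whose access type matches (e.g. "consecutive").
+        counts = {}
+        for key in poly_map.dict.keys():
+            dtype, access, direction = key
+            if access == access_type:
+                counts[(dtype, direction)] = poly_map.dict[key].eval_with_dict(params)
+        return counts
+
+For example, ``count_accesses(get_DRAM_access_poly(knl_consec), "consecutive", param_dict)`` would return the same four numbers printed above, keyed by data type and access direction.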
+
+~~~~~~~~~~~
+
+To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in our parallelization of the kernel:
+
+.. doctest::
+
+    >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
+    >>> load_store_map = get_DRAM_access_poly(knl_nonconsec)
+    >>> for key in load_store_map.dict.keys():
+    ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
+    (dtype('float32'), 'nonconsecutive', 'store') :
+    [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float64'), 'nonconsecutive', 'load') :
+    [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float64'), 'nonconsecutive', 'store') :
+    [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+    (dtype('float32'), 'nonconsecutive', 'load') :
+    [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 }
+
+With this parallelization, consecutive threads will access *nonconsecutive* array elements in memory. The total number of array accesses has not changed:
+
+.. doctest::
+
+    >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "nonconsecutive", "load")].eval_with_dict(param_dict)
+    >>> f64st = load_store_map.dict[(np.dtype(np.float64), "nonconsecutive", "store")].eval_with_dict(param_dict)
+    >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "nonconsecutive", "load")].eval_with_dict(param_dict)
+    >>> f32st = load_store_map.dict[(np.dtype(np.float32), "nonconsecutive", "store")].eval_with_dict(param_dict)
+    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % (f32ld, f32st, f64ld, f64st))
+    f32 load: 1572864
+    f32 store: 524288
+    f64 load: 131072
+    f64 store: 65536

 .. }}}
--
GitLab