From 2b2972ecab7df36c121dad611a95404a101a818e Mon Sep 17 00:00:00 2001 From: James Stevens <jdsteve2@illinois.edu> Date: Fri, 24 Jul 2015 21:20:31 -0500 Subject: [PATCH] minor changes --- doc/tutorial.rst | 145 +++++++++++++++++++------------------------- loopy/statistics.py | 44 +++++++------- 2 files changed, 84 insertions(+), 105 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index be732e30d..4b4ce4109 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1186,15 +1186,14 @@ TODO Gathering kernel statistics --------------------------- -Operations, array access, and barriers can all be counted, which may -facilitate performance prediction and optimization of a :mod:`loopy` -kernel. +Operations, array access, and barriers can all be counted, which may facilitate +performance prediction and optimization of a :mod:`loopy` kernel. .. note:: - The functions used in the following examples may produce warnings. If - you have already made the filterwarnings and catch_warnings calls used - in the examples above, you may need to reset these before continuing: + The functions used in the following examples may produce warnings. If you have + already made the filterwarnings and catch_warnings calls used in the examples + above, you may need to reset these before continuing: .. doctest:: @@ -1204,10 +1203,9 @@ kernel. Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_poly` provides information on the number and type of -operations being performed in a kernel. To demonstrate this, we'll create -an example kernel that performs several operations on arrays containing -different types of data: +:func:`loopy.get_op_poly` provides information on the number and type of operations +being performed in a kernel. To demonstrate this, we'll create an example kernel +that performs several operations on arrays containing different types of data: .. doctest:: @@ -1228,11 +1226,10 @@ information provided. 
Now we will count the operations: >>> from loopy.statistics import get_op_poly >>> op_map = get_op_poly(knl) -:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` -**:** :class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` -holds the number of operations for the :class:`numpy.dtype` specified in -the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print -this map now: +:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** +:class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the +number of operations for the :class:`numpy.dtype` specified in the key (in terms of +the :class:`loopy.LoopKernel` *inames*). We'll print this map now: .. doctest:: @@ -1247,12 +1244,9 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> i32ops = op_map.dict[ - ... np.dtype(np.int32)].eval_with_dict(param_dict) - >>> f32ops = op_map.dict[ - ... np.dtype(np.float32)].eval_with_dict(param_dict) - >>> f64ops = op_map.dict[ - ... np.dtype(np.float64)].eval_with_dict(param_dict) + >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict) + >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict) + >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict) >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % ... (i32ops, f32ops, f64ops)) integer ops: 65536 @@ -1262,9 +1256,9 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: Counting array accesses ~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_DRAM_access_poly` provides information on the number and -type of array loads and stores being performed in a kernel. To demonstrate -this, we'll continue using the kernel from the previous example: +:func:`loopy.get_DRAM_access_poly` provides information on the number and type of +array loads and stores being performed in a kernel. 
To demonstrate this, we'll +continue using the kernel from the previous example: .. doctest:: @@ -1283,35 +1277,31 @@ this, we'll continue using the kernel from the previous example: - The :class:`numpy.dtype` specifies the type of the data being accessed. -- The first string in the map key specifies the DRAM access type as - *consecutive*, *nonconsecutive*, or *uniform*. *Consecutive* memory - accesses occur when consecutive threads access consecutive array elements - in memory, *nonconsecutive* accesses occur when consecutive threads - access nonconsecutive array elements in memory, and *uniform* accesses - occur when consecutive threads access the *same* element in memory. +- The first string in the map key specifies the DRAM access type as *consecutive*, + *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when + consecutive threads access consecutive array elements in memory, *nonconsecutive* + accesses occur when consecutive threads access nonconsecutive array elements in + memory, and *uniform* accesses occur when consecutive threads access the *same* + element in memory. -- The second string in the map key specifies the DRAM access type as a - *load*, or a *store*. +- The second string in the map key specifies the DRAM access type as a *load*, or a + *store*. -- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with - the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). +- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the + characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` + *inames*). We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld = load_store_map.dict[ - ... (np.dtype(np.float64), "uniform", "load") + >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "uniform", "load") ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map.dict[ - ... 
(np.dtype(np.float64), "uniform", "store") + >>> f64st = load_store_map.dict[(np.dtype(np.float64), "uniform", "store") ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map.dict[ - ... (np.dtype(np.float32), "uniform", "load") + >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "uniform", "load") ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map.dict[ - ... (np.dtype(np.float32), "uniform", "store") + >>> f32st = load_store_map.dict[(np.dtype(np.float32), "uniform", "store") ... ].eval_with_dict(param_dict) >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % ... (f32ld, f32st, f64ld, f64st)) @@ -1322,17 +1312,15 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: ~~~~~~~~~~~ -Since we have not tagged any of the inames or parallelized the kernel -across threads (which would have produced iname tags), -:func:`loopy.get_DRAM_access_poly` considers the array accesses *uniform*. -Now we'll parallelize the kernel and count the array accesses again. The -resulting :class:`islpy.PwQPolynomial` will be more complicated this time, -so we'll print the mapping manually to make it more legible: +Since we have not tagged any of the inames or parallelized the kernel across threads +(which would have produced iname tags), :func:`loopy.get_DRAM_access_poly` considers +the array accesses *uniform*. Now we'll parallelize the kernel and count the array +accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated +this time, so we'll print the mapping manually to make it more legible: .. doctest:: - >>> knl_consec = lp.split_iname(knl, "k", 128, - ... outer_tag="l.1", inner_tag="l.0") + >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") >>> load_store_map = get_DRAM_access_poly(knl_consec) >>> for key in load_store_map.dict.keys(): ... 
print("%s :\n%s\n" % (key, load_store_map.dict[key])) @@ -1349,24 +1337,20 @@ so we'll print the mapping manually to make it more legible: [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } <BLANKLINE> -With this parallelization, consecutive threads will access consecutive -array elements in memory. The polynomials are a bit more complicated now -due to the parallelization, but when we evaluate them, we see that the -total number of array accesses has not changed: +With this parallelization, consecutive threads will access consecutive array +elements in memory. The polynomials are a bit more complicated now due to the +parallelization, but when we evaluate them, we see that the total number of array +accesses has not changed: .. doctest:: - >>> f64ld = load_store_map.dict[ - ... (np.dtype(np.float64), "consecutive", "load") + >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "consecutive", "load") ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map.dict[ - ... (np.dtype(np.float64), "consecutive", "store") + >>> f64st = load_store_map.dict[(np.dtype(np.float64), "consecutive", "store") ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map.dict[ - ... (np.dtype(np.float32), "consecutive", "load") + >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "consecutive", "load") ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map.dict[ - ... (np.dtype(np.float32), "consecutive", "store") + >>> f32st = load_store_map.dict[(np.dtype(np.float32), "consecutive", "store") ... ].eval_with_dict(param_dict) >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % ... 
(f32ld, f32st, f64ld, f64st)) @@ -1377,13 +1361,12 @@ total number of array accesses has not changed: ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses, we'll switch the inner and -outer tags in our parallelization of the kernel: +To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in +our parallelization of the kernel: .. doctest:: - >>> knl_nonconsec = lp.split_iname(knl, "k", 128, - ... outer_tag="l.0", inner_tag="l.1") + >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") >>> load_store_map = get_DRAM_access_poly(knl_nonconsec) >>> for key in load_store_map.dict.keys(): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) @@ -1400,9 +1383,8 @@ outer tags in our parallelization of the kernel: [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } <BLANKLINE> -With this parallelization, consecutive threads will access *nonconsecutive* -array elements in memory. The total number of array accesses has not -changed: +With this parallelization, consecutive threads will access *nonconsecutive* array +elements in memory. The total number of array accesses has not changed: .. doctest:: @@ -1428,8 +1410,8 @@ changed: Counting barriers ~~~~~~~~~~~~~~~~~ -:func:`loopy.get_barrier_poly` counts the number of barriers in a kernel. -First, we'll call this function on the kernel from the previous example: +:func:`loopy.get_barrier_poly` counts the number of barriers per **thread** in a +kernel. First, we'll call this function on the kernel from the previous example: .. doctest:: @@ -1462,8 +1444,7 @@ Now to make things more interesting, we'll create a kernel with barriers: ... "..." ... ]) >>> knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) - >>> knl = lp.split_iname(knl, "k", 128, - ... 
outer_tag="g.0", inner_tag="l.0") + >>> knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") >>> code, _ = lp.generate_code(lp.preprocess_kernel(knl)) >>> print(code) #define lid(N) ((int) get_local_id(N)) @@ -1484,10 +1465,10 @@ Now to make things more interesting, we'll create a kernel with barriers: } -In this kernel, when a thread performs the second instruction it uses data -produced by *different* threads during the first instruction. For correct -execution barriers are required, so loopy inserts them. Now we'll count the -barriers using :func:`loopy.get_barrier_poly`: +In this kernel, when a thread performs the second instruction it uses data produced +by *different* threads during the first instruction. For correct execution, barriers +are required, so loopy inserts them. Now we'll count the barriers using +:func:`loopy.get_barrier_poly`: .. doctest:: @@ -1498,10 +1479,10 @@ barriers using :func:`loopy.get_barrier_poly`: Barrier polynomial: { 1000 } Barrier count: 1000 -Based on the kernel code printed above, we would expect to find 50x10x2 -barriers, and we do. In this case, the number of barriers does not depend -on any inames, so we can pass an empty dictionary to -:func:`islpy.eval_with_dict`. +Based on the kernel code printed above, we would expect each thread to encounter +50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In +this case, the number of barriers does not depend on any inames, so we can pass an +empty dictionary to :func:`islpy.eval_with_dict`. .. }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index 52c6b5eeb..041720153 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -412,12 +412,12 @@ def get_op_poly(knl): :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :return: A mapping of **{** :class:`numpy.dtype` \ - **:** :class:`islpy.PwQPolynomial` **}**. 
+ :return: A mapping of **{** :class:`numpy.dtype` **:** + :class:`islpy.PwQPolynomial` **}**. - - The :class:`islpy.PwQPolynomial` holds the number of operations for \ - the :class:`numpy.dtype` specified in the key (in terms of \ - the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of operations for + the :class:`numpy.dtype` specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -453,25 +453,24 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts """Count the number of DRAM accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` \ - whose DRAM accesses are to be counted. + :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be + counted. - :return: A mapping of **{(** \ - :class:`numpy.dtype` **,** :class:`string` **,** :class:`string` \ - **)** **:** :class:`islpy.PwQPolynomial` **}**. + :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`str` **,** + :class:`str` **)** **:** :class:`islpy.PwQPolynomial` **}**. - - The :class:`numpy.dtype` specifies \ - the type of the data being accessed. + - The :class:`numpy.dtype` specifies the type of the data being + accessed. - - The first string in the map key specifies the DRAM access type as \ - *consecutive*, *nonconsecutive*, or *uniform*. + - The first string in the map key specifies the DRAM access type as + *consecutive*, *nonconsecutive*, or *uniform*. - - The second string in the map key specifies the DRAM access type as \ - a *load*, or a *store*. + - The second string in the map key specifies the DRAM access type as a + *load* or a *store*. - - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses \ - with the characteristics specified in the key (in terms of the \ - :class:`loopy.LoopKernel` *inames*). 
+ - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -520,12 +519,11 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts def get_barrier_poly(knl): - """Count the number of barriers in a loopy kernel. + """Count the number of barriers each thread encounters in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` \ - whose barriers are to be counted. + :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls \ + :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls made (in terms of the :class:`loopy.LoopKernel` *inames*). Example usage:: -- GitLab