From 451fd1801c92e61ae61f68d1ccc4d495781f75cf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sun, 9 Aug 2015 13:46:02 -0500 Subject: [PATCH] Make output from stats doctests more deterministic --- doc/tutorial.rst | 34 ++++++++++++++++++---------------- loopy/statistics.py | 2 +- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4b4ce4109..13a87d72c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1234,9 +1234,9 @@ the :class:`loopy.LoopKernel` *inames*). We'll print this map now: .. doctest:: >>> print(op_map) + float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } - float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> We can evaluate these polynomials using :func:`islpy.eval_with_dict`: @@ -1265,10 +1265,10 @@ continue using the kernel from the previous example: >>> from loopy.statistics import get_DRAM_access_poly >>> load_store_map = get_DRAM_access_poly(knl) >>> print(load_store_map) + (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } - (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> :func:`loopy.get_DRAM_access_poly` returns a mapping of **{(** @@ -1322,21 +1322,22 @@ this time, so we'll print the mapping manually to make it more legible: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") >>> load_store_map = get_DRAM_access_poly(knl_consec) - >>> for key in load_store_map.dict.keys(): + >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) (dtype('float32'), 'consecutive', 'load') : - [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + [n, m, l] -> { (((192 * n + -3 * n * m) * l * floor((m)/128) + 192 * n * l * floor((m)/128)^2) + (192 * n + 3 * n * m) * l * floor((127 + m)/128) + -192 * n * l * floor((127 + m)/128)^2) : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> - (dtype('float64'), 'consecutive', 'store') : - [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + (dtype('float32'), 'consecutive', 'store') : + [n, m, l] -> { (((64 * n + -n * m) * l * floor((m)/128) + 64 * n * l * floor((m)/128)^2) + (64 * n + n * m) * l * floor((127 + m)/128) + -64 * n * l * floor((127 + m)/128)^2) : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> (dtype('float64'), 'consecutive', 'load') : - [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> - (dtype('float32'), 'consecutive', 'store') : - [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + (dtype('float64'), 'consecutive', 'store') : + [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> + With this parallelization, consecutive threads will access consecutive array elements in memory. The polynomials are a bit more complicated now due to the parallelization, but when we evaluate them, we see that the total number of array @@ -1368,21 +1369,22 @@ our parallelization of the kernel: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") >>> load_store_map = get_DRAM_access_poly(knl_nonconsec) - >>> for key in load_store_map.dict.keys(): + >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) + (dtype('float32'), 'nonconsecutive', 'load') : + [n, m, l] -> { (((192 * n + -3 * n * m) * l * floor((m)/128) + 192 * n * l * floor((m)/128)^2) + (192 * n + 3 * n * m) * l * floor((127 + m)/128) + -192 * n * l * floor((127 + m)/128)^2) : n >= 1 and m >= 1 and l >= 1 } + <BLANKLINE> (dtype('float32'), 'nonconsecutive', 'store') : - [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + [n, m, l] -> { (((64 * n + -n * m) * l * floor((m)/128) + 64 * n * l * floor((m)/128)^2) + (64 * n + n * m) * l * floor((127 + m)/128) + -64 * n * l * floor((127 + m)/128)^2) : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> (dtype('float64'), 'nonconsecutive', 'load') : - [n, m, l] -> { (2 * n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (256 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> (dtype('float64'), 'nonconsecutive', 'store') : - [n, m, l] -> { (n * m * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (128 * n * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } - <BLANKLINE> - (dtype('float32'), 'nonconsecutive', 'load') : - [n, m, l] -> { (3 * n * m * l * floor((127 + m)/128)) : n >= 1 and m <= 127 and m >= 1 and l >= 1; (384 * n * l * floor((127 + m)/128)) : n >= 1 and m >= 128 and l >= 1 } + [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> + With this parallelization, consecutive threads will access *nonconsecutive* array elements in memory. The total number of array accesses has not changed: diff --git a/loopy/statistics.py b/loopy/statistics.py index 041720153..7281cd2b2 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -77,7 +77,7 @@ class ToCountMap: def __str__(self): result = "" - for key in self.dict.keys(): + for key in sorted(self.dict.keys(), key=lambda k: str(k)): result += ("%s : %s\n" % (key, self.dict[key])) return result -- GitLab