From ad0f6d13cc622018247db6df75bc70d4b301ab7e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 8 Sep 2018 22:16:26 -0500 Subject: [PATCH 01/20] counting madds --- loopy/statistics.py | 44 ++++++++++-- test/test_statistics.py | 148 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 178 insertions(+), 14 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3fecfb778..e16461874 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -723,12 +723,44 @@ class ExpressionOpCounter(CounterBase): def map_sum(self, expr): assert expr.children - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name='add', - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + + # count children that are products + from pymbolic.primitives import Product, is_zero + # subtraction/negation yields a product that we don't want to count (-1)*x + # however we DO want to count the second multiplication in something like (-1)*x*3.14 + child_mul_ct = sum(isinstance(child, Product) and + (len(child.children) - sum(is_zero(grandchild + 1) for grandchild in child.children) > 1) + for child in expr.children) + + # compute add and madd counts + add_ct = len(expr.children)-1 + madd_ct = min(add_ct, child_mul_ct) + add_ct = add_ct - madd_ct + + # construct ToCountMap, start with recursing on children + result = sum(self.rec(child) for child in expr.children) + + if add_ct: + result += ToCountMap( + {Op(dtype=self.type_inf(expr), + name='add', + count_granularity=CountGranularity.SUBGROUP): add_ct}) + + if madd_ct: + # muls that are part of madds are counted as muls when recursing on children + # update mul count based on madd count + # this may yield 0 count for mul, which will be removed later + result += ToCountMap( + {Op(dtype=self.type_inf(expr), + name='madd', + count_granularity=CountGranularity.SUBGROUP): madd_ct} + ) + ToCountMap( + {Op(dtype=self.type_inf(expr), + name='mul', + count_granularity=CountGranularity.SUBGROUP): -madd_ct} + ) + + return result def map_product(self, expr): from pymbolic.primitives import is_zero diff --git a/test/test_statistics.py b/test/test_statistics.py index 3f2366521..86fb20f63 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -98,15 +98,13 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) - ].eval_with_dict(params) + f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert f32add == f32mul == n*m*ell*n_subgroups + assert f32madd == n*m*ell*n_subgroups op_map_dtype = op_map.group_by('dtype') f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) - assert f32 == f32add + f32mul + assert f32 == f32madd def test_op_counter_logic(): @@ -176,6 +174,8 @@ def test_op_counter_specialops(): f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) ].eval_with_dict(params) + f64madd = op_map[lp.Op(np.dtype(np.float64), 'madd', CG.SUBGROUP) + ].eval_with_dict(params) i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) @@ -185,7 +185,8 @@ def test_op_counter_specialops(): # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups assert f32mul == f32add == n*m*ell*n_subgroups - assert f64add == 3*n*m*n_subgroups + assert f64add == 2*n*m*n_subgroups + assert f64madd == n*m*n_subgroups assert f64pow == i32add == f64rsq == f64sin == n*m*n_subgroups @@ -229,10 +230,137 @@ def test_op_counter_bitwise(): assert i32add == n*m+n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups - assert i64add == i64mul == n*m*n_subgroups + #assert i64add == i64mul == n*m*n_subgroups assert i64shift == 2*n*m*n_subgroups +def test_op_counter_madd(): + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group + n = 512 + m = 256 + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} + + knl = lp.make_kernel( + "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j,k]: 0<=i Date: Sat, 8 Sep 2018 23:27:18 -0500 Subject: [PATCH 02/20] removing map entries where the count is zero --- loopy/statistics.py | 20 +++++++++++++++----- test/test_statistics.py | 25 ++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index e16461874..afebdb233 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -110,6 +110,9 @@ class GuardedPwQPolynomial(object): p = isl.PwQPolynomial('{ 0 }') return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) + def is_zero(self): + return self.pwqpolynomial.is_zero() + def __str__(self): return str(self.pwqpolynomial) @@ -256,11 +259,11 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :arg func: A function that takes a map key a parameter and returns a - :class:`bool`. + :arg func: A function that takes a map key and val as parameters and + returns a :class:`bool`. :arg: A :class:`ToCountMap` containing the subset of the items in the - original :class:`ToCountMap` for which func(key) is true. + original :class:`ToCountMap` for which func(key, val) is true. Example usage:: @@ -268,7 +271,7 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_map(knl) - def filter_func(key): + def filter_func(key, val): return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4: filtered_map = mem_map.filter_by_func(filter_func) @@ -282,7 +285,7 @@ class ToCountMap(object): # for each item in self.count_map, call func on the key for self_key, self_val in self.items(): - if func(self_key): + if func(self_key, self_val): result_map[self_key] = self_val return result_map @@ -401,6 +404,7 @@ class ToCountMap(object): return result + def sum(self): """Add all counts in ToCountMap. @@ -1437,6 +1441,12 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, raise NotImplementedError("unexpected instruction item type: '%s'" % type(insn).__name__) + # remove zero values created by op counter in map_product and map_sum + def val_is_not_zero(key, val): + return not val.is_zero() + + op_map = op_map.filter_by_func(val_is_not_zero) + if numpy_types: return ToCountMap( init_dict=dict( diff --git a/test/test_statistics.py b/test/test_statistics.py index 86fb20f63..44cfc21f5 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -401,6 +401,29 @@ def test_op_counter_triangular_domain(): assert flops == 78*n_subgroups +def test_zero_count_removal(): + + knl = lp.make_kernel( + "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i Date: Sat, 8 Sep 2018 23:28:11 -0500 Subject: [PATCH 03/20] fixed documentation typo --- loopy/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index afebdb233..228cbe2fe 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -262,7 +262,7 @@ class ToCountMap(object): :arg func: A function that takes a map key and val as parameters and returns a :class:`bool`. - :arg: A :class:`ToCountMap` containing the subset of the items in the + :return: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` for which func(key, val) is true. Example usage:: -- GitLab From 8afba2554e3365e90a8f523805163fac1ed396dd Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 8 Sep 2018 23:35:49 -0500 Subject: [PATCH 04/20] simpler solution to removing ops from map when their count is zero --- loopy/statistics.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 228cbe2fe..55e501daa 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -110,9 +110,6 @@ class GuardedPwQPolynomial(object): p = isl.PwQPolynomial('{ 0 }') return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) - def is_zero(self): - return self.pwqpolynomial.is_zero() - def __str__(self): return str(self.pwqpolynomial) @@ -1428,12 +1425,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) for key, val in six.iteritems(ops.count_map): - op_map = ( - op_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + if val != 0: + op_map = ( + op_map + + ToCountMap({key: val}) + * _get_insn_count(knl, insn.id, subgroup_size, + count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1441,12 +1439,6 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, raise NotImplementedError("unexpected instruction item type: '%s'" % type(insn).__name__) - # remove zero values created by op counter in map_product and map_sum - def val_is_not_zero(key, val): - return not val.is_zero() - - op_map = op_map.filter_by_func(val_is_not_zero) - if numpy_types: return ToCountMap( init_dict=dict( -- GitLab From 8f84742018c7ec38975e68b2675e1f07e039daf0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Sep 2018 00:02:08 -0500 Subject: [PATCH 05/20] made madd counting optional --- loopy/statistics.py | 19 +++++++++++--- test/test_statistics.py | 55 ++++++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 55e501daa..a07ff95c8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -698,8 +698,9 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl): + def __init__(self, knl, count_madds): self.knl = knl + self.count_madds = count_madds from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) @@ -725,6 +726,15 @@ class ExpressionOpCounter(CounterBase): def map_sum(self, expr): assert expr.children + if not self.count_madds: + return ToCountMap( + {Op(dtype=self.type_inf(expr), + name='add', + count_granularity=CountGranularity.SUBGROUP): + len(expr.children) - 1} + ) + self.rec(expr.children) + + # count children that are products from pymbolic.primitives import Product, is_zero # subtraction/negation yields a product that we don't want to count (-1)*x @@ -1347,7 +1357,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map def get_op_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + subgroup_size=None, count_madds=False): """Count the number of operations in a loopy kernel. @@ -1375,6 +1385,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. + :arg count_madds: A :class:`bool` determining whether to count + multiplication followed by addition as a single operation. + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1415,7 +1428,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, knl = preprocess_kernel(knl) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) + op_counter = ExpressionOpCounter(knl, count_madds) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, diff --git a/test/test_statistics.py b/test/test_statistics.py index 44cfc21f5..c50c50fcd 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -89,7 +89,8 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_madds=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -159,7 +160,8 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_madds=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -207,7 +209,8 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_madds=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -230,7 +233,7 @@ def test_op_counter_bitwise(): assert i32add == n*m+n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups - #assert i64add == i64mul == n*m*n_subgroups + assert i64add == i64mul == n*m*n_subgroups assert i64shift == 2*n*m*n_subgroups @@ -250,7 +253,8 @@ def test_op_counter_madd(): name="basic", assumptions="n,m >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_madds=True) f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) @@ -258,13 +262,23 @@ def test_op_counter_madd(): assert f32add == f32mul == 0 assert f32madd == n*m*n_subgroups + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_madds=False) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) + # (count-per-sub-group)*n_subgroups + assert f32add == f32mul == n*m*n_subgroups + assert f32madd == 0 + knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i Date: Sun, 9 Sep 2018 00:06:28 -0500 Subject: [PATCH 06/20] updated documentation --- doc/tutorial.rst | 2 +- loopy/statistics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1272d2a59..5056ee89a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1827,7 +1827,7 @@ criteria are more complicated than a simple list of allowable values: .. doctest:: - >>> def f(key): + >>> def f(key, val): ... from loopy.types import to_loopy_type ... return key.dtype == to_loopy_type(np.float32) and \ ... key.lid_strides[0] > 1 diff --git a/loopy/statistics.py b/loopy/statistics.py index a07ff95c8..b78982083 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -499,7 +499,7 @@ class Op(Record): .. attribute:: name A :class:`str` that specifies the kind of arithmetic operation as - *add*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + *add*, *mul*, *madd*, *div*, *pow*, *shift*, *bw* (bitwise), etc. .. attribute:: count_granularity -- GitLab From 266651a7e41db50a4fbc6cde5429dbc8f94d109a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Sep 2018 00:14:41 -0500 Subject: [PATCH 07/20] fixed formatting issues --- loopy/statistics.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b78982083..b4e0a571f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -401,7 +401,6 @@ class ToCountMap(object): return result - def sum(self): """Add all counts in ToCountMap. @@ -734,13 +733,13 @@ class ExpressionOpCounter(CounterBase): len(expr.children) - 1} ) + self.rec(expr.children) - # count children that are products from pymbolic.primitives import Product, is_zero # subtraction/negation yields a product that we don't want to count (-1)*x - # however we DO want to count the second multiplication in something like (-1)*x*3.14 + # however we DO want to count the second multiplication in, e.g., (-1)*x*3.14 child_mul_ct = sum(isinstance(child, Product) and - (len(child.children) - sum(is_zero(grandchild + 1) for grandchild in child.children) > 1) + (len(child.children) - sum(is_zero(grandchild + 1) + for grandchild in child.children) > 1) for child in expr.children) # compute add and madd counts @@ -758,7 +757,7 @@ class ExpressionOpCounter(CounterBase): count_granularity=CountGranularity.SUBGROUP): add_ct}) if madd_ct: - # muls that are part of madds are counted as muls when recursing on children + # muls that are part of madds are counted as muls when recursing # update mul count based on madd count # this may yield 0 count for mul, which will be removed later result += ToCountMap( -- GitLab From 2b53dbd8d27fb968465acbd0a5c16e8d299597b8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Sep 2018 00:44:33 -0500 Subject: [PATCH 08/20] fixed counting typo/bug --- loopy/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b4e0a571f..f19077ac5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -731,7 +731,7 @@ class ExpressionOpCounter(CounterBase): name='add', count_granularity=CountGranularity.SUBGROUP): len(expr.children) - 1} - ) + self.rec(expr.children) + ) + sum(self.rec(child) for child in expr.children) # count children that are products from pymbolic.primitives import Product, is_zero -- GitLab From fa05709e81efa659d21c507507c6dec91ea647d1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 11 Sep 2018 20:58:40 -0500 Subject: [PATCH 09/20] eliminated need to subtract muls in madd counting by processing child products within map_addition and recursing on grandchildren --- loopy/statistics.py | 60 ++++++++++++++++++++++++++++++++++++ test/test_statistics.py | 68 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f19077ac5..45a0eb5b9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -735,6 +735,8 @@ class ExpressionOpCounter(CounterBase): # count children that are products from pymbolic.primitives import Product, is_zero + + """ # subtraction/negation yields a product that we don't want to count (-1)*x # however we DO want to count the second multiplication in, e.g., (-1)*x*3.14 child_mul_ct = sum(isinstance(child, Product) and @@ -769,6 +771,64 @@ class ExpressionOpCounter(CounterBase): name='mul', count_granularity=CountGranularity.SUBGROUP): -madd_ct} ) + """ + + # construct ToCountMap + result = ToCountMap() + + adds_available = len(expr.children)-1 + madd_ct = 0 + child_mul_count = 0 + for child in expr.children: + if isinstance(child, Product) and adds_available > madd_ct: + # process this product as in map_product, + # but first check to see if one mul can be counted as a madd + + # count legitimate muls (i.e., not (-1)*x + child_muls_available = len(child.children) - 1 - sum( + is_zero(grandchild + 1) for grandchild in child.children) + + if child_muls_available: + # TODO: could insert madd op each time here but faster to + # count them and do single op insert at end? + + madd_ct += 1 # count one mul as madd + + # are there remaining muls to be counted? + if child_muls_available > 1: + # TODO: could insert mul op each time here but faster to + # count them and do single op insert at end? + + # count remaining muls as muls + child_mul_count += child_muls_available - 1 + + # recurse on grandchildren that are not (-1), as in map_product + result += sum(self.rec(grandchild) for grandchild in child.children + if not is_zero(child + 1)) + + else: + # either non-product or no adds available for madd, recurse as usual + result += self.rec(child) + + # only insert ops if the value is non-zero + if adds_available > madd_ct: + result += ToCountMap( + {Op(dtype=self.type_inf(expr), + name='add', + count_granularity=CountGranularity.SUBGROUP): + adds_available - madd_ct}) + if madd_ct: + result += ToCountMap( + {Op(dtype=self.type_inf(expr), + name='madd', + count_granularity=CountGranularity.SUBGROUP): + madd_ct}) + if child_mul_count: + result += ToCountMap( + {Op(dtype=self.type_inf(expr), + name='mul', + count_granularity=CountGranularity.SUBGROUP): + child_mul_count}) return result diff --git a/test/test_statistics.py b/test/test_statistics.py index c50c50fcd..3bc48e0fc 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -288,7 +288,55 @@ def test_op_counter_madd(): knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i Date: Wed, 14 Nov 2018 19:43:44 -0600 Subject: [PATCH 10/20] WIP: added TODO, need to get data type correct in madds, mul might have different type than add --- loopy/statistics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 45a0eb5b9..baa71f2d3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -779,6 +779,7 @@ class ExpressionOpCounter(CounterBase): adds_available = len(expr.children)-1 madd_ct = 0 child_mul_count = 0 + sum_expr_type = self.type_inf(expr) # TODO figure out how to handle dtype correctly for child in expr.children: if isinstance(child, Product) and adds_available > madd_ct: # process this product as in map_product, -- GitLab From 346cf4664e16a8174e89553234725328fd6817e9 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 14 Nov 2018 19:59:14 -0600 Subject: [PATCH 11/20] WIP: temporary fix for madd dtype checking, not sure if it works yet, need tests --- loopy/statistics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index baa71f2d3..d0ecaafb3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -781,7 +781,9 @@ class ExpressionOpCounter(CounterBase): child_mul_count = 0 sum_expr_type = self.type_inf(expr) # TODO figure out how to handle dtype correctly for child in expr.children: - if isinstance(child, Product) and adds_available > madd_ct: + #child_expr_type = self.type_inf(child.expr) + child_expr_type = self.type_inf(child) + if isinstance(child, Product) and child_expr_type == sum_expr_type and adds_available > madd_ct: # process this product as in map_product, # but first check to see if one mul can be counted as a madd -- GitLab From 5a330251071cee10397d1e36cd998b268894c7cb Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Dec 2018 19:22:36 -0600 Subject: [PATCH 12/20] added test for type mismatch between mul and add --- test/test_statistics.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index c620927d4..e435e6da7 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -333,7 +333,6 @@ def test_op_counter_madd(): # (count-per-sub-group)*n_subgroups assert f32mul == 0 assert f32add == n*m*n_subgroups - # ------------------------------------------------------------------------- knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i Date: Sun, 9 Dec 2018 19:26:44 -0600 Subject: [PATCH 13/20] reordered madd test lines for consistency --- test/test_statistics.py | 50 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index e435e6da7..9867cc8c3 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -256,8 +256,8 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == 0 @@ -265,11 +265,11 @@ def test_op_counter_madd(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=False) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert f32add == f32mul == n*m*n_subgroups + assert f32mul == f32add == n*m*n_subgroups assert f32madd == 0 knl = lp.make_kernel( @@ -280,11 +280,11 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert f32add == f32mul == 0 + assert f32mul == f32add == 0 assert f32madd == n*m*n_subgroups knl = lp.make_kernel( @@ -295,8 +295,8 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == 0 @@ -311,8 +311,8 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == 0 @@ -327,8 +327,8 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == 0 @@ -342,13 +342,13 @@ def test_op_counter_madd(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, count_madds=True) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) f32madd = op_map[lp.Op(np.float32, 'madd', CG.SUBGROUP)].eval_with_dict(params) # (count-per-sub-group)*n_subgroups + assert f32mul == 3*n*m*n_subgroups assert f32add == 0 assert f32madd == n*m*n_subgroups - assert f32mul == 3*n*m*n_subgroups knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i Date: Sun, 9 Dec 2018 19:27:57 -0600 Subject: [PATCH 14/20] renamed madd test knls --- test/test_statistics.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 9867cc8c3..7b7679b99 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -251,7 +251,7 @@ def test_op_counter_madd(): knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j,k]: 0<=i {[i,j]: 0<=i Date: Sun, 9 Dec 2018 19:56:16 -0600 Subject: [PATCH 15/20] organized and documented madd test cases --- test/test_statistics.py | 50 +++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 7b7679b99..85e1cd849 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -248,12 +248,16 @@ def test_op_counter_madd(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} + # ------------------------------------------------------------------------------- + # standard madd counting knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j,k]: 0<=i {[i,j]: 0<=i Date: Sun, 9 Dec 2018 21:13:50 -0600 Subject: [PATCH 16/20] more comments in madd code/test for clarity; removed old (commented out) madd code --- loopy/statistics.py | 75 +++++++++-------------------------------- test/test_statistics.py | 2 ++ 2 files changed, 18 insertions(+), 59 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d7ccc9117..29a2c5169 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -746,87 +746,44 @@ class ExpressionOpCounter(CounterBase): len(expr.children) - 1} ) + sum(self.rec(child) for child in expr.children) - # count children that are products - from pymbolic.primitives import Product, is_zero - - """ - # subtraction/negation yields a product that we don't want to count (-1)*x - # however we DO want to count the second multiplication in, e.g., (-1)*x*3.14 - child_mul_ct = sum(isinstance(child, Product) and - (len(child.children) - sum(is_zero(grandchild + 1) - for grandchild in child.children) > 1) - for child in expr.children) - - # compute add and madd counts - add_ct = len(expr.children)-1 - madd_ct = min(add_ct, child_mul_ct) - add_ct = add_ct - madd_ct - - # construct ToCountMap, start with recursing on children - result = sum(self.rec(child) for child in expr.children) - - if add_ct: - result += ToCountMap( - {Op(dtype=self.type_inf(expr), - name='add', - count_granularity=CountGranularity.SUBGROUP): add_ct}) - - if madd_ct: - # muls that are part of madds are counted as muls when recursing - # update mul count based on madd count - # this may yield 0 count for mul, which will be removed later - result += ToCountMap( - {Op(dtype=self.type_inf(expr), - name='madd', - count_granularity=CountGranularity.SUBGROUP): madd_ct} - ) + ToCountMap( - {Op(dtype=self.type_inf(expr), - name='mul', - count_granularity=CountGranularity.SUBGROUP): -madd_ct} - ) - """ - # construct ToCountMap result = ToCountMap() + # compute count for madds, adds, and muls within expr and expr.children, + # and recurse on any uncounted expressions adds_available = len(expr.children)-1 madd_ct = 0 child_mul_count = 0 - sum_expr_type = self.type_inf(expr) # TODO figure out how to handle dtype correctly + sum_expr_type = self.type_inf(expr) + from pymbolic.primitives import Product, is_zero + for child in expr.children: - #child_expr_type = self.type_inf(child.expr) - child_expr_type = self.type_inf(child) - if isinstance(child, Product) and child_expr_type == sum_expr_type and adds_available > madd_ct: - # process this product as in map_product, + # if child is Product w/ matching dtype and unpaired adds exist, + # then consider it as potential source for madds, + # otherwise recurse on child as usual + if isinstance(child, Product) and self.type_inf(child) == sum_expr_type \ + and adds_available > madd_ct: + # process this product as in map_product(), # but first check to see if one mul can be counted as a madd - # count legitimate muls (i.e., not (-1)*x + # count muls excluding negation + # i.e., (-1)*x contains 0 muls; (-1)*x*y contains one mul child_muls_available = len(child.children) - 1 - sum( is_zero(grandchild + 1) for grandchild in child.children) if child_muls_available: - # TODO: could insert madd op each time here but faster to - # count them and do single op insert at end? - madd_ct += 1 # count one mul as madd - - # are there remaining muls to be counted? + # if there are remaining muls, count them if child_muls_available > 1: - # TODO: could insert mul op each time here but faster to - # count them and do single op insert at end? - - # count remaining muls as muls child_mul_count += child_muls_available - 1 # recurse on grandchildren that are not (-1), as in map_product result += sum(self.rec(grandchild) for grandchild in child.children if not is_zero(child + 1)) - else: - # either non-product or no adds available for madd, recurse as usual - result += self.rec(child) + result += self.rec(child) # recurse as usual - # only insert ops if the value is non-zero + # only insert ops if the count is non-zero if adds_available > madd_ct: result += ToCountMap( {Op(dtype=self.type_inf(expr), diff --git a/test/test_statistics.py b/test/test_statistics.py index 85e1cd849..1475ef106 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -410,6 +410,8 @@ def test_op_counter_madd(): assert f32madd == n*m*n_subgroups # negation beginning string of muls: + # subtraction/negation yields a product that we don't want to count (-1)*x + # however we DO want to count the second multiplication in, e.g., (-1)*x*3.14 knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i Date: Sun, 9 Dec 2018 21:15:37 -0600 Subject: [PATCH 17/20] reusing sum_expr_type variable --- loopy/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 29a2c5169..59e7f353a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -786,19 +786,19 @@ class ExpressionOpCounter(CounterBase): # only insert ops if the count is non-zero if adds_available > madd_ct: result += ToCountMap( - {Op(dtype=self.type_inf(expr), + {Op(dtype=sum_expr_type, name='add', count_granularity=CountGranularity.SUBGROUP): adds_available - madd_ct}) if madd_ct: result += ToCountMap( - {Op(dtype=self.type_inf(expr), + {Op(dtype=sum_expr_type, name='madd', count_granularity=CountGranularity.SUBGROUP): madd_ct}) if child_mul_count: result += ToCountMap( - {Op(dtype=self.type_inf(expr), + {Op(dtype=sum_expr_type, name='mul', count_granularity=CountGranularity.SUBGROUP): child_mul_count}) -- GitLab From 72d0aebd2f543b7494c9b06b98430238a92ca58d Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Dec 2018 21:18:18 -0600 Subject: [PATCH 18/20] removed assertion --- loopy/statistics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 59e7f353a..4afd2d443 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -736,8 +736,6 @@ class ExpressionOpCounter(CounterBase): return ToCountMap() def map_sum(self, expr): - assert expr.children - if not self.count_madds: return ToCountMap( {Op(dtype=self.type_inf(expr), @@ -783,7 +781,7 @@ class ExpressionOpCounter(CounterBase): else: result += self.rec(child) # recurse as usual - # only insert ops if the count is non-zero + # only insert ops if count is non-zero if adds_available > madd_ct: result += ToCountMap( {Op(dtype=sum_expr_type, -- GitLab From 7f47b2f81a0afb44a004f772d24450a2abb73ea1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 10 Dec 2018 18:17:00 -0600 Subject: [PATCH 19/20] clearer madd comments, one more test --- loopy/statistics.py | 6 +++--- test/test_statistics.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 4afd2d443..407b2cffa 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -747,7 +747,7 @@ class ExpressionOpCounter(CounterBase): # construct ToCountMap result = ToCountMap() - # compute count for madds, adds, and muls within expr and expr.children, + # first compute count for madds, adds, and muls in expr and expr.children, # and recurse on any uncounted expressions adds_available = len(expr.children)-1 madd_ct = 0 @@ -778,10 +778,10 @@ class ExpressionOpCounter(CounterBase): # recurse on grandchildren that are not (-1), as in map_product result += sum(self.rec(grandchild) for grandchild in child.children if not is_zero(child + 1)) - else: + else: # not madd result += self.rec(child) # recurse as usual - # only insert ops if count is non-zero + # second, insert op for madds, adds, and muls if count is non-zero if adds_available > madd_ct: result += ToCountMap( {Op(dtype=sum_expr_type, diff --git a/test/test_statistics.py b/test/test_statistics.py index 1475ef106..0b7007d66 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -250,6 +250,8 @@ def test_op_counter_madd(): # ------------------------------------------------------------------------------- # standard madd counting + + # perform madd knl = lp.make_kernel( "[n,m] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i Date: Mon, 10 Dec 2018 18:41:56 -0600 Subject: [PATCH 20/20] renamed child_mul_count->child_mul_ct for consistency --- loopy/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 407b2cffa..3ef90c4ef 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -751,7 +751,7 @@ class ExpressionOpCounter(CounterBase): # and recurse on any uncounted expressions adds_available = len(expr.children)-1 madd_ct = 0 - child_mul_count = 0 + child_mul_ct = 0 sum_expr_type = self.type_inf(expr) from pymbolic.primitives import Product, is_zero @@ -773,7 +773,7 @@ class ExpressionOpCounter(CounterBase): madd_ct += 1 # count one mul as madd # if there are remaining muls, count them if child_muls_available > 1: - child_mul_count += child_muls_available - 1 + child_mul_ct += child_muls_available - 1 # recurse on grandchildren that are not (-1), as in map_product result += sum(self.rec(grandchild) for grandchild in child.children @@ -794,12 +794,12 @@ class ExpressionOpCounter(CounterBase): name='madd', count_granularity=CountGranularity.SUBGROUP): madd_ct}) - if child_mul_count: + if child_mul_ct: result += ToCountMap( {Op(dtype=sum_expr_type, name='mul', count_granularity=CountGranularity.SUBGROUP): - child_mul_count}) + child_mul_ct}) return result -- GitLab