diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 153c66f90e315427128d3e0ffda983f630f90977..53938cba6da79d46c02ab4206a4712fa920509cb 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -199,7 +199,7 @@ For convenience, loopy kernels also directly accept :mod:`numpy` arrays: >>> evt, (out,) = knl(queue, a=x_vec_host) >>> assert (out == (2*x_vec_host)).all() -Notice how both *out* nor *a* are :mod:`numpy` arrays, but neither needed +Notice how both *out* and *a* are :mod:`numpy` arrays, but neither needed to be transferred to or from the device. Checking for numpy arrays and transferring them if needed comes at a potential performance cost. If you would like to make sure that you avoid this cost, pass @@ -1186,7 +1186,7 @@ across the remaining axis of the workgroup would emerge. TODO -Gathering kernel statistics +Obtaining Kernel Statistics --------------------------- Operations, array access, and barriers can all be counted, which may facilitate @@ -1229,17 +1229,21 @@ information provided. Now we will count the operations: >>> from loopy.statistics import get_op_poly >>> op_map = get_op_poly(knl) -:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:** -:class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the -number of operations for the :class:`numpy.dtype` specified in the key (in terms of -the :class:`loopy.LoopKernel` *inames*). We'll print this map now: +:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** +:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The +:class:`islpy.PwQPolynomial` holds the number of operations for the type specified +in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this +map now: .. 
doctest:: >>> print(lp.stringify_stats_mapping(op_map)) - float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } - float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } - int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } + (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } + (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } + (dtype('int32'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } <BLANKLINE> We can evaluate these polynomials using :func:`islpy.eval_with_dict`: @@ -1247,14 +1251,20 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> i32ops = op_map[np.dtype(np.int32)].eval_with_dict(param_dict) - >>> f32ops = op_map[np.dtype(np.float32)].eval_with_dict(param_dict) - >>> f64ops = op_map[np.dtype(np.float64)].eval_with_dict(param_dict) - >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % - ... (i32ops, f32ops, f64ops)) - integer ops: 65536 - float32 ops: 1572864 - float64 ops: 131072 + >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict) + >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict) + >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict) + >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict) + >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict) + >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict) + >>> print("%i\n%i\n%i\n%i\n%i\n%i" % + ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) + 524288 + 524288 + 524288 + 65536 + 65536 + 65536 Counting array accesses ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1471,9 +1481,9 @@ Now to make things more interesting, we'll create a kernel with barriers: In this kernel, when a thread performs the second instruction it uses data produced -by *different* threads during the first instruction. For correct execution barriers -are required, so loopy inserts them. Now we'll count the barriers using -:func:`loopy.get_barrier_poly`: +by *different* threads during the first instruction. Because of this, barriers are +required for correct execution, so loopy inserts them. Now we'll count the barriers +using :func:`loopy.get_barrier_poly`: .. doctest:: diff --git a/loopy/statistics.py b/loopy/statistics.py index 57a8a83b55896b06ff322efe7d19e07c0121667d..834f482072a51386460e09d6c4f4d6a4406fa56a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,15 +38,12 @@ class ToCountMap: def __init__(self, init_dict=None): if init_dict is None: init_dict = {} - self.dict = init_dict def __add__(self, other): result = self.dict.copy() - for k, v in six.iteritems(other.dict): result[k] = self.dict.get(k, 0) + v - return ToCountMap(result) def __radd__(self, other): @@ -55,7 +52,6 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." 
.format(type(other), other)) - return self def __mul__(self, other): @@ -109,7 +105,8 @@ class ExpressionOpCounter(CombineMapper): #def map_function_symbol(self, expr): # return 0,0 - map_call = map_constant + def map_call(self, expr): + return self.rec(expr.parameters) # def map_call_with_kwargs(self, expr): # implemented in CombineMapper @@ -119,66 +116,53 @@ class ExpressionOpCounter(CombineMapper): # def map_lookup(self, expr): # implemented in CombineMapper def map_sum(self, expr): - if expr.children: - return ToCountMap( - {self.type_inf(expr): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() + assert expr.children + return ToCountMap( + {(self.type_inf(expr), 'add'): len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero - if expr.children: - return sum(ToCountMap({self.type_inf(expr): 1}) + self.rec(child) - for child in expr.children - # Do not count '(-1)* ' (as produced by - # subtraction in pymbolic): Assume this - # gets implemented as a sign flip or - # as subtraction. (Confirmed to be true on - # at least Nvidia 352.30.) 
- if not is_zero(child + 1)) + \ - ToCountMap({self.type_inf(expr): -1}) - - else: - return ToCountMap() + assert expr.children + return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1}) + + self.rec(child) + for child in expr.children + if not is_zero(child + 1)) + \ + ToCountMap({(self.type_inf(expr), 'mul'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({self.type_inf(expr): 1}) \ + return ToCountMap({(self.type_inf(expr), 'div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) map_floor_div = map_quotient - map_remainder = map_quotient # implemented in CombineMapper + map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({self.type_inf(expr): 1}) \ + return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) - def map_left_shift(self, expr): # implemented in CombineMapper - return ToCountMap({self.type_inf(expr): 1}) \ + def map_left_shift(self, expr): + return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift - def map_bitwise_not(self, expr): # implemented in CombineMapper - return ToCountMap({self.type_inf(expr): 1}) \ + def map_bitwise_not(self, expr): + return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - # implemented in CombineMapper, maps to map_sum; return ToCountMap( - {self.type_inf(expr): len(expr.children)-1} + {(self.type_inf(expr), 'bw'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or - # implemented in CombineMapper, maps to map_sum; - map_bitwise_and = map_bitwise_or - # implemented in CombineMapper, maps to map_sum; - def map_comparison(self, expr): # implemented in CombineMapper + def map_comparison(self, expr): return self.rec(expr.left)+self.rec(expr.right) def map_logical_not(self, expr): @@ -189,20 +173,22 @@ class 
ExpressionOpCounter(CombineMapper): map_logical_and = map_logical_or - def map_if(self, expr): # implemented in CombineMapper, recurses - warnings.warn("ExpressionOpCounter counting DRAM accesses as " + def map_if(self, expr): + warnings.warn("ExpressionOpCounter counting ops as " "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) - def map_if_positive(self, expr): # implemented in FlopCounter - warnings.warn("ExpressionOpCounter counting DRAM accesses as " + def map_if_positive(self, expr): + warnings.warn("ExpressionOpCounter counting ops as " "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) - map_min = map_bitwise_or - # implemented in CombineMapper, maps to map_sum; # TODO test + def map_min(self, expr): + return ToCountMap( + {(self.type_inf(expr), 'maxmin'): len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) - map_max = map_min # implemented in CombineMapper, maps to map_sum; # TODO test + map_max = map_min def map_common_subexpression(self, expr): raise NotImplementedError("ExpressionOpCounter encountered " @@ -237,7 +223,9 @@ class GlobalSubscriptCounter(CombineMapper): map_tagged_variable = map_constant map_variable = map_constant - map_call = map_constant + + def map_call(self, expr): + return self.rec(expr.parameters) def map_subscript(self, expr): name = expr.aggregate.name # name of array @@ -360,12 +348,12 @@ class GlobalSubscriptCounter(CombineMapper): map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("GlobalSubscriptCounter counting DRAM accesses as " + warnings.warn("GlobalSubscriptCounter counting GMEM 
accesses as " "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) @@ -428,11 +416,17 @@ def get_op_poly(knl): :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :return: A mapping of **{** :class:`numpy.dtype` **:** - :class:`islpy.PwQPolynomial` **}**. + :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **)** + **:** :class:`islpy.PwQPolynomial` **}**. + + - The :class:`numpy.dtype` specifies the type of the data being + operated on. - - The :class:`islpy.PwQPolynomial` holds the number of operations for - the :class:`numpy.dtype` specified in the key (in terms of the + - The string specifies the operation type as + *add*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), *maxmin*, etc. + + - The :class:`islpy.PwQPolynomial` holds the number of operations of + the kind specified in the key (in terms of the :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -441,8 +435,8 @@ def get_op_poly(knl): poly = get_op_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(params) - float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(params) + f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) # (now use these counts to predict performance) @@ -452,7 +446,7 @@ def get_op_poly(knl): knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - op_poly = 0 + op_poly = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: # how many times is this instruction executed? @@ -466,6 +460,7 @@ def get_op_poly(knl): def get_gmem_access_poly(knl): # for now just counting subscripts + """Count the number of global memory accesses in a loopy kernel.
:parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be @@ -514,7 +509,7 @@ def get_gmem_access_poly(knl): # for now just counting subscripts knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_poly = 0 + subs_poly = ToCountMap() subscript_counter = GlobalSubscriptCounter(knl) for insn in knl.instructions: insn_inames = knl.insn_inames(insn) @@ -590,7 +585,7 @@ def get_barrier_poly(knl): knl = preprocess_kernel(knl) knl = lp.get_one_scheduled_kernel(knl) iname_list = [] - barrier_poly = isl.PwQPolynomial('{ 0 }') # 0 + barrier_poly = isl.PwQPolynomial('{ 0 }') for sched_item in knl.schedule: if isinstance(sched_item, EnterLoop): @@ -610,3 +605,4 @@ def get_barrier_poly(knl): barrier_poly += isl.PwQPolynomial('{ 1 }') return barrier_poly + diff --git a/test/test_statistics.py b/test/test_statistics.py index a504761193fe4acb7dff9a4a9535efb7a74fe2a9..0dffe5c3575237cab8f518ba95a33f74a3bbe840 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -38,7 +38,7 @@ def test_op_counter_basic(): [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] - e[i, k+1] = g[i,k]*h[i,k+1] + e[i, k+1] = -g[i,k]*h[i,k+1] """ ], name="basic", assumptions="n,m,l >= 1") @@ -49,12 +49,15 @@ def test_op_counter_basic(): n = 512 m = 256 l = 128 - f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - assert f32 == 3*n*m*l - assert f64 == n*m - assert i32 == n*m*2 + params = {'n': n, 'm': m, 'l': l} + f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) + f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + assert f32add == 
f32mul == f32div == n*m*l + assert f64mul == n*m + assert i32add == n*m*2 def test_op_counter_reduction(): @@ -71,8 +74,10 @@ def test_op_counter_reduction(): n = 512 m = 256 l = 128 - f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - assert f32 == 2*n*m*l + params = {'n': n, 'm': m, 'l': l} + f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + assert f32add == f32mul == n*m*l def test_op_counter_logic(): @@ -91,12 +96,15 @@ def test_op_counter_logic(): n = 512 m = 256 l = 128 - f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - assert f32 == n*m - assert f64 == 3*n*m - assert i32 == n*m + params = {'n': n, 'm': m, 'l': l} + f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) + f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + assert f32mul == n*m + assert f64div == 2*n*m # TODO why? 
+ assert f64add == n*m + assert i32add == n*m def test_op_counter_specialops(): @@ -117,12 +125,17 @@ def test_op_counter_specialops(): n = 512 m = 256 l = 128 - f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - assert f32 == 4*n*m*l - assert f64 == 3*n*m - assert i32 == n*m + params = {'n': n, 'm': m, 'l': l} + f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) + f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) + f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params) + f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + assert f32div == 2*n*m*l + assert f32mul == f32add == n*m*l + assert f64add == 2*n*m + assert f64pow == i32add == n*m def test_op_counter_bitwise(): @@ -146,11 +159,18 @@ def test_op_counter_bitwise(): n = 512 m = 256 l = 128 - i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i64 = poly[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l}) # noqa - assert np.dtype(np.float64) not in poly - assert i32 == n*m+3*n*m*l - assert i64 == 6*n*m + params = {'n': n, 'm': m, 'l': l} + i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params) + i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params) + assert i32add == n*m+n*m*l + assert i32bw == 2*n*m*l + assert i64bw == 2*n*m + assert i64add == i64mul == n*m + assert i64shift == 2*n*m def 
test_op_counter_triangular_domain(): @@ -174,7 +194,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = get_op_poly(knl)[np.dtype(np.float64)] + poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -202,21 +222,22 @@ def test_gmem_access_counter_basic(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -235,14 +256,15 @@ def test_gmem_access_counter_reduction(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == 2*n*m*l f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == n*l @@ -262,18 +284,19 @@ def test_gmem_access_counter_logic(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64 == n*m @@ -295,21 
+318,22 @@ def test_gmem_access_counter_specialops(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -335,14 +359,15 @@ def test_gmem_access_counter_bitwise(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} i32 = poly[ (np.dtype(np.int32), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l i32 = poly[ (np.dtype(np.int32), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -366,21 +391,22 @@ def test_gmem_access_counter_mixed(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f64uniform = poly[ (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32nonconsec == 3*n*m*l f64uniform = poly[ (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -405,21 +431,22 @@ def test_gmem_access_counter_nonconsec(): n = 512 m = 256 l = 128 + 
params = {'n': n, 'm': m, 'l': l} f64nonconsec = poly[ (np.dtype(np.float64), 'nonconsecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l f64nonconsec = poly[ (np.dtype(np.float64), 'nonconsecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -443,22 +470,23 @@ def test_gmem_access_counter_consec(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} f64consec = poly[ (np.dtype(np.float64), 'consecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32consec = poly[ (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l f64consec = poly[ (np.dtype(np.float64), 'consecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32consec = poly[ (np.dtype(np.float32), 'consecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -481,7 +509,8 @@ def test_barrier_counter_nobarriers(): n = 512 m = 256 l = 128 - barrier_count = poly.eval_with_dict({'n': n, 'm': m, 'l': l}) + params = {'n': n, 'm': m, 'l': l} + barrier_count = poly.eval_with_dict(params) assert barrier_count == 0 @@ -506,7 +535,8 @@ def test_barrier_counter_barriers(): n = 512 m = 256 l = 128 - barrier_count = poly.eval_with_dict({'n': n, 'm': m, 'l': l}) + params = {'n': n, 'm': m, 'l': l} + barrier_count = poly.eval_with_dict(params) assert barrier_count == 
50*10*2 @@ -525,35 +555,42 @@ def test_all_counters_parallel_matmul(): n = 512 m = 256 l = 128 + params = {'n': n, 'm': m, 'l': l} - barrier_count = get_barrier_poly(knl).eval_with_dict({'n': n, 'm': m, 'l': l}) + barrier_count = get_barrier_poly(knl).eval_with_dict(params) assert barrier_count == 0 op_map = get_op_poly(knl) - f32ops = op_map[ - np.dtype(np.float32) - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32mul = op_map[ + (np.dtype(np.float32), 'mul') + ].eval_with_dict(params) + f32add = op_map[ + (np.dtype(np.float32), 'add') + ].eval_with_dict(params) i32ops = op_map[ - np.dtype(np.int32) - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + (np.dtype(np.int32), 'add') + ].eval_with_dict(params) + i32ops += op_map[ + (np.dtype(np.int32), 'mul') + ].eval_with_dict(params) - assert f32ops == n*m*l*2 + assert f32mul+f32add == n*m*l*2 assert i32ops == n*m*l*4 + l*n*4 subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map[ (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) f32coal = subscript_map[ (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32uncoal == n*m*l assert f32coal == n*m*l f32coal = subscript_map[ (np.dtype(np.float32), 'consecutive', 'store') - ].eval_with_dict({'n': n, 'm': m, 'l': l}) + ].eval_with_dict(params) assert f32coal == n*l