diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 153c66f90e315427128d3e0ffda983f630f90977..53938cba6da79d46c02ab4206a4712fa920509cb 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -199,7 +199,7 @@ For convenience, loopy kernels also directly accept :mod:`numpy` arrays:
     >>> evt, (out,) = knl(queue, a=x_vec_host)
     >>> assert (out == (2*x_vec_host)).all()
-Notice how both *out* nor *a* are :mod:`numpy` arrays, but neither needed
+Notice how both *out* and *a* are :mod:`numpy` arrays, but neither needed
 to be transferred to or from the device.  Checking for numpy arrays and
 transferring them if needed comes at a potential performance cost.  If you
 would like to make sure that you avoid this cost, pass
@@ -1186,7 +1186,7 @@ across the remaining axis of the workgroup would emerge.
-Gathering kernel statistics
+Obtaining Kernel Statistics
 Operations, array access, and barriers can all be counted, which may facilitate
@@ -1229,17 +1229,21 @@ information provided. Now we will count the operations:
     >>> from loopy.statistics import get_op_poly
     >>> op_map = get_op_poly(knl)
-:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:**
-:class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the
-number of operations for the :class:`numpy.dtype` specified in the key (in terms of
-the :class:`loopy.LoopKernel` *inames*). We'll print this map now:
+:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,**
+:class:`str` **)** **:** :class:`islpy.PwQPolynomial` **}**. The
+:class:`islpy.PwQPolynomial` holds the number of operations for the data type and
+operation kind specified in the key (in terms of the :class:`loopy.LoopKernel`
+*inames*). We'll print this map now:
 .. doctest::
     >>> print(lp.stringify_stats_mapping(op_map))
-    float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
-    float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
-    int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('int32'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
@@ -1247,14 +1251,20 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 .. doctest::
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> i32ops = op_map[np.dtype(np.int32)].eval_with_dict(param_dict)
-    >>> f32ops = op_map[np.dtype(np.float32)].eval_with_dict(param_dict)
-    >>> f64ops = op_map[np.dtype(np.float64)].eval_with_dict(param_dict)
-    >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" %
-    ...     (i32ops, f32ops, f64ops))
-    integer ops: 65536
-    float32 ops: 1572864
-    float64 ops: 131072
+    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
+    >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
+    ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
+    524288
+    524288
+    524288
+    65536
+    65536
+    65536
 Counting array accesses
@@ -1471,9 +1481,9 @@ Now to make things more interesting, we'll create a kernel with barriers:
 In this kernel, when a thread performs the second instruction it uses data produced
-by *different* threads during the first instruction. For correct execution barriers
-are required, so loopy inserts them. Now we'll count the barriers using
+by *different* threads during the first instruction. Because of this, barriers are
+required for correct execution, so loopy inserts them. Now we'll count the barriers
+using :func:`loopy.get_barrier_poly`:
 .. doctest::
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 57a8a83b55896b06ff322efe7d19e07c0121667d..834f482072a51386460e09d6c4f4d6a4406fa56a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,15 +38,12 @@ class ToCountMap:
     def __init__(self, init_dict=None):
         if init_dict is None:
             init_dict = {}
         self.dict = init_dict
     def __add__(self, other):
         result = self.dict.copy()
         for k, v in six.iteritems(other.dict):
             result[k] = self.dict.get(k, 0) + v
         return ToCountMap(result)
     def __radd__(self, other):
@@ -55,7 +52,6 @@ class ToCountMap:
                                 "to {} {}. ToCountMap may only be added to "
                                 "0 and other ToCountMap objects."
                                 .format(type(other), other))
         return self
     def __mul__(self, other):
@@ -109,7 +105,8 @@ class ExpressionOpCounter(CombineMapper):
     #def map_function_symbol(self, expr):
     #    return 0,0
-    map_call = map_constant
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
@@ -119,66 +116,53 @@ class ExpressionOpCounter(CombineMapper):
     # def map_lookup(self, expr):  # implemented in CombineMapper
     def map_sum(self, expr):
-        if expr.children:
-            return ToCountMap(
-                        {self.type_inf(expr): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
+        assert expr.children
+        return ToCountMap(
+                    {(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    ) + sum(self.rec(child) for child in expr.children)
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
-        if expr.children:
-            return sum(ToCountMap({self.type_inf(expr): 1}) + self.rec(child)
-                       for child in expr.children
-                       # Do not count '(-1)* ' (as produced by
-                       # subtraction in pymbolic): Assume this
-                       # gets implemented as a sign flip or
-                       # as subtraction. (Confirmed to be true on
-                       # at least Nvidia 352.30.)
-                       if not is_zero(child + 1)) + \
-                       ToCountMap({self.type_inf(expr): -1})
-        else:
-            return ToCountMap()
+        assert expr.children
+        return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
+                   + self.rec(child)
+                   for child in expr.children
+                   if not is_zero(child + 1)) + \
+                   ToCountMap({(self.type_inf(expr), 'mul'): -1})
     def map_quotient(self, expr, *args):
-        return ToCountMap({self.type_inf(expr): 1}) \
+        return ToCountMap({(self.type_inf(expr), 'div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
     map_floor_div = map_quotient
-    map_remainder = map_quotient  # implemented in CombineMapper
+    map_remainder = map_quotient
     def map_power(self, expr):
-        return ToCountMap({self.type_inf(expr): 1}) \
+        return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
-    def map_left_shift(self, expr):  # implemented in CombineMapper
-        return ToCountMap({self.type_inf(expr): 1}) \
+    def map_left_shift(self, expr):
+        return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
     map_right_shift = map_left_shift
-    def map_bitwise_not(self, expr):  # implemented in CombineMapper
-        return ToCountMap({self.type_inf(expr): 1}) \
+    def map_bitwise_not(self, expr):
+        return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \
                                 + self.rec(expr.child)
     def map_bitwise_or(self, expr):
-        # implemented in CombineMapper, maps to map_sum;
         return ToCountMap(
-                        {self.type_inf(expr): len(expr.children)-1}
+                        {(self.type_inf(expr), 'bw'): len(expr.children)-1}
                         ) + sum(self.rec(child) for child in expr.children)
     map_bitwise_xor = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;
     map_bitwise_and = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;
-    def map_comparison(self, expr):  # implemented in CombineMapper
+    def map_comparison(self, expr):
         return self.rec(expr.left)+self.rec(expr.right)
     def map_logical_not(self, expr):
@@ -189,20 +173,22 @@ class ExpressionOpCounter(CombineMapper):
     map_logical_and = map_logical_or
-    def map_if(self, expr):  # implemented in CombineMapper, recurses
-        warnings.warn("ExpressionOpCounter counting DRAM accesses as "
+    def map_if(self, expr):
+        warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
-    def map_if_positive(self, expr):  # implemented in FlopCounter
-        warnings.warn("ExpressionOpCounter counting DRAM accesses as "
+    def map_if_positive(self, expr):
+        warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
-    map_min = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;  # TODO test
+    def map_min(self, expr):
+        return ToCountMap(
+                        {(self.type_inf(expr), 'maxmin'): len(expr.children)-1}
+                        ) + sum(self.rec(child) for child in expr.children)
-    map_max = map_min  # implemented in CombineMapper, maps to map_sum;  # TODO test
+    map_max = map_min
     def map_common_subexpression(self, expr):
         raise NotImplementedError("ExpressionOpCounter encountered "
@@ -237,7 +223,9 @@ class GlobalSubscriptCounter(CombineMapper):
     map_tagged_variable = map_constant
     map_variable = map_constant
-    map_call = map_constant
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
     def map_subscript(self, expr):
         name = expr.aggregate.name  # name of array
@@ -360,12 +348,12 @@ class GlobalSubscriptCounter(CombineMapper):
     map_logical_and = map_logical_or
     def map_if(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
@@ -428,11 +416,17 @@ def get_op_poly(knl):
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
-    :return: A mapping of **{** :class:`numpy.dtype` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`str` **)**
+             **:** :class:`islpy.PwQPolynomial` **}**.
+             - The :class:`numpy.dtype` specifies the type of the data being
+               operated on.
-             - The :class:`islpy.PwQPolynomial` holds the number of operations for
-               the :class:`numpy.dtype` specified in the key (in terms of the
+             - The string specifies the operation type as *add*, *mul*, *div*,
+               *pow*, *shift*, *bw* (bitwise), *maxmin*, etc.
+             - The :class:`islpy.PwQPolynomial` holds the number of operations of
+               the kind specified in the key (in terms of the
                :class:`loopy.LoopKernel` *inames*).
     Example usage::
@@ -441,8 +435,8 @@ def get_op_poly(knl):
         poly = get_op_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        float32_op_ct = poly.dict[np.dtype(np.float32)].eval_with_dict(params)
-        float64_op_ct = poly.dict[np.dtype(np.float64)].eval_with_dict(params)
+        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
+        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
         # (now use these counts to predict performance)
@@ -452,7 +446,7 @@ def get_op_poly(knl):
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
-    op_poly = 0
+    op_poly = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
@@ -466,6 +460,7 @@ def get_op_poly(knl):
 def get_gmem_access_poly(knl):  # for now just counting subscripts
     """Count the number of global memory accesses in a loopy kernel.
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -514,7 +509,7 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
-    subs_poly = 0
+    subs_poly = ToCountMap()
     subscript_counter = GlobalSubscriptCounter(knl)
     for insn in knl.instructions:
         insn_inames = knl.insn_inames(insn)
@@ -590,7 +585,7 @@ def get_barrier_poly(knl):
     knl = preprocess_kernel(knl)
     knl = lp.get_one_scheduled_kernel(knl)
     iname_list = []
-    barrier_poly = isl.PwQPolynomial('{ 0 }')  # 0
+    barrier_poly = isl.PwQPolynomial('{ 0 }')
     for sched_item in knl.schedule:
         if isinstance(sched_item, EnterLoop):
@@ -610,3 +605,4 @@ def get_barrier_poly(knl):
                 barrier_poly += isl.PwQPolynomial('{ 1 }')
     return barrier_poly
diff --git a/test/test_statistics.py b/test/test_statistics.py
index a504761193fe4acb7dff9a4a9535efb7a74fe2a9..0dffe5c3575237cab8f518ba95a33f74a3bbe840 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -38,7 +38,7 @@ def test_op_counter_basic():
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
-                e[i, k+1] = g[i,k]*h[i,k+1]
+                e[i, k+1] = -g[i,k]*h[i,k+1]
             name="basic", assumptions="n,m,l >= 1")
@@ -49,12 +49,15 @@ def test_op_counter_basic():
     n = 512
     m = 256
     l = 128
-    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    assert f32 == 3*n*m*l
-    assert f64 == n*m
-    assert i32 == n*m*2
+    params = {'n': n, 'm': m, 'l': l}
+    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    assert f32add == f32mul == f32div == n*m*l
+    assert f64mul == n*m
+    assert i32add == n*m*2
 def test_op_counter_reduction():
@@ -71,8 +74,10 @@ def test_op_counter_reduction():
     n = 512
     m = 256
     l = 128
-    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    assert f32 == 2*n*m*l
+    params = {'n': n, 'm': m, 'l': l}
+    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    assert f32add == f32mul == n*m*l
 def test_op_counter_logic():
@@ -91,12 +96,15 @@ def test_op_counter_logic():
     n = 512
     m = 256
     l = 128
-    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    assert f32 == n*m
-    assert f64 == 3*n*m
-    assert i32 == n*m
+    params = {'n': n, 'm': m, 'l': l}
+    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    assert f32mul == n*m
+    assert f64div == 2*n*m  # TODO why?
+    assert f64add == n*m
+    assert i32add == n*m
 def test_op_counter_specialops():
@@ -117,12 +125,17 @@ def test_op_counter_specialops():
     n = 512
     m = 256
     l = 128
-    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    assert f32 == 4*n*m*l
-    assert f64 == 3*n*m
-    assert i32 == n*m
+    params = {'n': n, 'm': m, 'l': l}
+    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
+    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    assert f32div == 2*n*m*l
+    assert f32mul == f32add == n*m*l
+    assert f64add == 2*n*m
+    assert f64pow == i32add == n*m
 def test_op_counter_bitwise():
@@ -146,11 +159,18 @@ def test_op_counter_bitwise():
     n = 512
     m = 256
     l = 128
-    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i64 = poly[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l})  # noqa
-    assert np.dtype(np.float64) not in poly
-    assert i32 == n*m+3*n*m*l
-    assert i64 == 6*n*m
+    params = {'n': n, 'm': m, 'l': l}
+    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
+    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    assert i32add == n*m+n*m*l
+    assert i32bw == 2*n*m*l
+    assert i64bw == 2*n*m
+    assert i64add == i64mul == n*m
+    assert i64shift == 2*n*m
 def test_op_counter_triangular_domain():
@@ -174,7 +194,7 @@ def test_op_counter_triangular_domain():
         expect_fallback = False
-    poly = get_op_poly(knl)[np.dtype(np.float64)]
+    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
@@ -202,21 +222,22 @@ def test_gmem_access_counter_basic():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
-                   ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                   ].eval_with_dict(params)
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
-                   ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                   ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
-                   ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                   ].eval_with_dict(params)
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
-                   ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                   ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -235,14 +256,15 @@ def test_gmem_access_counter_reduction():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f32 == n*l
@@ -262,18 +284,19 @@ def test_gmem_access_counter_logic():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64 == n*m
@@ -295,21 +318,22 @@ def test_gmem_access_counter_specialops():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
     f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -335,14 +359,15 @@ def test_gmem_access_counter_bitwise():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     i32 = poly[
                     (np.dtype(np.int32), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
     i32 = poly[
                     (np.dtype(np.int32), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
@@ -366,21 +391,22 @@ def test_gmem_access_counter_mixed():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f64uniform = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32nonconsec == 3*n*m*l
     f64uniform = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
@@ -405,21 +431,22 @@ def test_gmem_access_counter_nonconsec():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f64nonconsec = poly[
                     (np.dtype(np.float64), 'nonconsecutive', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
     f64nonconsec = poly[
                     (np.dtype(np.float64), 'nonconsecutive', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
@@ -443,22 +470,23 @@ def test_gmem_access_counter_consec():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
     f64consec = poly[
                     (np.dtype(np.float64), 'consecutive', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32consec = poly[
                     (np.dtype(np.float32), 'consecutive', 'load')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
     f64consec = poly[
                     (np.dtype(np.float64), 'consecutive', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     f32consec = poly[
                     (np.dtype(np.float32), 'consecutive', 'store')
-                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                    ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
@@ -481,7 +509,8 @@ def test_barrier_counter_nobarriers():
     n = 512
     m = 256
     l = 128
-    barrier_count = poly.eval_with_dict({'n': n, 'm': m, 'l': l})
+    params = {'n': n, 'm': m, 'l': l}
+    barrier_count = poly.eval_with_dict(params)
     assert barrier_count == 0
@@ -506,7 +535,8 @@ def test_barrier_counter_barriers():
     n = 512
     m = 256
     l = 128
-    barrier_count = poly.eval_with_dict({'n': n, 'm': m, 'l': l})
+    params = {'n': n, 'm': m, 'l': l}
+    barrier_count = poly.eval_with_dict(params)
     assert barrier_count == 50*10*2
@@ -525,35 +555,42 @@ def test_all_counters_parallel_matmul():
     n = 512
     m = 256
     l = 128
+    params = {'n': n, 'm': m, 'l': l}
-    barrier_count = get_barrier_poly(knl).eval_with_dict({'n': n, 'm': m, 'l': l})
+    barrier_count = get_barrier_poly(knl).eval_with_dict(params)
     assert barrier_count == 0
     op_map = get_op_poly(knl)
-    f32ops = op_map[
-                        np.dtype(np.float32)
-                        ].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f32mul = op_map[
+                        (np.dtype(np.float32), 'mul')
+                        ].eval_with_dict(params)
+    f32add = op_map[
+                        (np.dtype(np.float32), 'add')
+                        ].eval_with_dict(params)
     i32ops = op_map[
-                        np.dtype(np.int32)
-                        ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                        (np.dtype(np.int32), 'add')
+                        ].eval_with_dict(params)
+    i32ops += op_map[
+                        (np.dtype(np.int32), 'mul')
+                        ].eval_with_dict(params)
-    assert f32ops == n*m*l*2
+    assert f32mul+f32add == n*m*l*2
     assert i32ops == n*m*l*4 + l*n*4
     subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
                         (np.dtype(np.float32), 'nonconsecutive', 'load')
-                        ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                        ].eval_with_dict(params)
     f32coal = subscript_map[
                         (np.dtype(np.float32), 'consecutive', 'load')
-                        ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                        ].eval_with_dict(params)
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
     f32coal = subscript_map[
                         (np.dtype(np.float32), 'consecutive', 'store')
-                        ].eval_with_dict({'n': n, 'm': m, 'l': l})
+                        ].eval_with_dict(params)
     assert f32coal == n*l