diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 9d525605b7a41c39400cd4d4beb7e64e38d766f2..b7be43f7e283bc464c88a6cdb49141964f9a8f1c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1440,26 +1440,28 @@ elements in memory. The total number of array accesses has not changed:
     f64 load: 131072
     f64 store: 65536
 
-Counting barriers
-~~~~~~~~~~~~~~~~~
+Counting synchronization events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_barrier_poly` counts the number of barriers per **thread** in a
-kernel. First, we'll call this function on the kernel from the previous example:
+:func:`loopy.get_synchronization_poly` counts the number of synchronization
+events per **thread** in a kernel. First, we'll call this function on the
+kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_barrier_poly
-    >>> barrier_poly = get_barrier_poly(knl)
-    >>> print("Barrier polynomial: %s" % barrier_poly)
-    Barrier polynomial: { 0 }
+    >>> from loopy.statistics import get_synchronization_poly
+    >>> barrier_poly = get_synchronization_poly(knl)
+    >>> print(lp.stringify_stats_mapping(barrier_poly))
+    kernel_launch : { 1 }
+    <BLANKLINE>
 
-We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
+We can evaluate one of these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> barrier_count = barrier_poly.eval_with_dict(param_dict)
-    >>> print("Barrier count: %s" % barrier_count)
-    Barrier count: 0
+    >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
+    >>> print("Kernel launch count: %s" % launch_count)
+    Kernel launch count: 1
 
 Now to make things more interesting, we'll create a kernel with barriers:
 
@@ -1505,12 +1507,11 @@ using :func:`loopy.get_barrier_poly`:
 
 .. doctest::
 
-    >>> barrier_poly = get_barrier_poly(knl)
-    >>> barrier_count = barrier_poly.eval_with_dict({})
-    >>> print("Barrier polynomial: %s\nBarrier count: %i" %
-    ...     (barrier_poly, barrier_count))
-    Barrier polynomial: { 1000 }
-    Barrier count: 1000
+    >>> sync_map = lp.get_synchronization_poly(knl)
+    >>> print(lp.stringify_stats_mapping(sync_map))
+    barrier_local : { 1000 }
+    kernel_launch : { 1 }
+    <BLANKLINE>
 
 Based on the kernel code printed above, we would expect each thread to encounter
-50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
+50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_poly`. In
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8562df4118534392cf66ada250aacdfdbc0e1917..fc3fb208a1a867a886bd63e9169f7f7a4b4522e2 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -107,7 +107,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
         get_gmem_access_poly,
-        get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
+        get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
         sum_mem_access_to_bytes,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -205,7 +205,8 @@ __all__ = [
 
         "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
         "get_DRAM_access_poly",
-        "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
+        "get_synchronization_poly", "stringify_stats_mapping",
+        "sum_mem_access_to_bytes",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 1a044b8d7f389fbb8e58d3fa597932f8c56ac97f..8a31f67fbf85bf8a370f17d2775569697f247502 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -45,7 +45,7 @@ __doc__ = """
 
 .. autofunction:: sum_mem_access_to_bytes
 
-.. autofunction:: get_barrier_poly
+.. autofunction:: get_synchronization_poly
 
 .. autofunction:: gather_access_footprints
 .. autofunction:: gather_access_footprint_bytes
@@ -793,16 +793,21 @@ def sum_mem_access_to_bytes(m):
 # }}}
 
 
-# {{{ get_barrier_poly
+# {{{ get_synchronization_poly
 
-def get_barrier_poly(knl):
+def get_synchronization_poly(knl):
 
-    """Count the number of barriers each thread encounters in a loopy kernel.
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
+    :parameter knl: A :class:`loopy.LoopKernel` to count synchronization events for.
 
-    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls
-             made (in terms of the :class:`loopy.LoopKernel` *inames*).
+    :return: A dictionary mapping each type of synchronization event to a
+            :class:`islpy.PwQPolynomial` holding the number of such events
+            per thread.
+
+            Possible keys include ``barrier_local``, ``barrier_global``
+            (if supported by the target) and ``kernel_launch``.
 
     Example usage::
 
@@ -817,13 +822,27 @@ def get_barrier_poly(knl):
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier
+    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
+            CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
     knl = lp.get_one_scheduled_kernel(knl)
     iname_list = []
-    barrier_poly = isl.PwQPolynomial('{ 0 }')
+
+    result = ToCountMap()
+
+    one = isl.PwQPolynomial('{ 1 }')
+
+    def get_count_poly(iname_list):
+        if iname_list:  # (if iname_list is not empty)
+            ct = (count(knl, (
+                            knl.get_inames_domain(iname_list).
+                            project_out_except(iname_list, [dim_type.set])
+                            )), )
+            return reduce(mul, ct)
+        else:
+            return one
 
     for sched_item in knl.schedule:
         if isinstance(sched_item, EnterLoop):
@@ -832,17 +851,23 @@ def get_barrier_poly(knl):
         elif isinstance(sched_item, LeaveLoop):
             if sched_item.iname:  # (if not empty)
                 iname_list.pop()
+
         elif isinstance(sched_item, Barrier):
-            if iname_list:  # (if iname_list is not empty)
-                ct = (count(knl, (
-                                knl.get_inames_domain(iname_list).
-                                project_out_except(iname_list, [dim_type.set])
-                                )), )
-                barrier_poly += reduce(mul, ct)
-            else:
-                barrier_poly += isl.PwQPolynomial('{ 1 }')
+            result = result + ToCountMap(
+                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+
+        elif isinstance(sched_item, CallKernel):
+            result = result + ToCountMap(
+                    {"kernel_launch": get_count_poly(iname_list)})
+
+        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
+            pass
+
+        else:
+            raise LoopyError("unexpected schedule item: %s"
+                    % type(sched_item).__name__)
 
-    return barrier_poly
+    return result.dict
 
 # }}}
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 2cf537f5ed9c039d09cb1d10066ec9294898d9b9..4bcacf59eb5752800cee2322844d6e33968dc91e 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,7 +28,6 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
-from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
 import numpy as np
 
 
@@ -46,7 +45,7 @@ def test_op_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -71,7 +70,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -93,7 +92,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -122,7 +121,7 @@ def test_op_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -158,7 +157,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = get_op_poly(knl)
+    poly = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -197,7 +196,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -221,7 +220,7 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -255,7 +254,7 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -283,7 +282,7 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -317,7 +316,7 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -358,7 +357,7 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -392,7 +391,7 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -436,7 +435,7 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_gmem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -475,7 +474,7 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = get_gmem_access_poly(knl)
+    poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -514,13 +513,13 @@ def test_barrier_counter_nobarriers():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_barrier_poly(knl)
+    sync_poly = lp.get_synchronization_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly.eval_with_dict(params)
-    assert barrier_count == 0
+    assert len(sync_poly) == 1
+    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
 
 
 def test_barrier_counter_barriers():
@@ -540,12 +539,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
-    poly = get_barrier_poly(knl)
+    poly = lp.get_synchronization_poly(knl)
+    print(poly)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly.eval_with_dict(params)
+    barrier_count = poly["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
 
@@ -566,10 +566,11 @@ def test_all_counters_parallel_matmul():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    barrier_count = get_barrier_poly(knl).eval_with_dict(params)
-    assert barrier_count == 0
+    sync_poly = lp.get_synchronization_poly(knl)
+    assert len(sync_poly) == 1
+    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
 
-    op_map = get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     f32mul = op_map[
                         (np.dtype(np.float32), 'mul')
                         ].eval_with_dict(params)
@@ -586,7 +587,7 @@ def test_all_counters_parallel_matmul():
     assert f32mul+f32add == n*m*l*2
     assert i32ops == n*m*l*4 + l*n*4
 
-    subscript_map = get_gmem_access_poly(knl)
+    subscript_map = lp.get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
                         (np.dtype(np.float32), 'nonconsecutive', 'load')
                         ].eval_with_dict(params)