From 5318f5ff097d45f080e44e96b798903697c6f1cf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sat, 21 May 2016 19:08:30 +0200 Subject: [PATCH] get_barrier_poly -> get_synchronization_poly --- doc/tutorial.rst | 35 ++++++++++++------------ loopy/__init__.py | 5 ++-- loopy/statistics.py | 59 +++++++++++++++++++++++++++++------------ test/test_statistics.py | 49 +++++++++++++++++----------------- 4 files changed, 88 insertions(+), 60 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 9d525605b..b7be43f7e 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1440,26 +1440,28 @@ elements in memory. The total number of array accesses has not changed: f64 load: 131072 f64 store: 65536 -Counting barriers -~~~~~~~~~~~~~~~~~ +Counting synchronization events +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_barrier_poly` counts the number of barriers per **thread** in a -kernel. First, we'll call this function on the kernel from the previous example: +:func:`loopy.get_synchronization_poly` counts the number of synchronization +events per **thread** in a kernel. First, we'll call this function on the +kernel from the previous example: .. doctest:: - >>> from loopy.statistics import get_barrier_poly - >>> barrier_poly = get_barrier_poly(knl) - >>> print("Barrier polynomial: %s" % barrier_poly) - Barrier polynomial: { 0 } + >>> from loopy.statistics import get_synchronization_poly + >>> barrier_poly = get_synchronization_poly(knl) + >>> print(lp.stringify_stats_mapping(barrier_poly)) + kernel_launch : { 1 } + <BLANKLINE> We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. doctest:: - >>> barrier_count = barrier_poly.eval_with_dict(param_dict) - >>> print("Barrier count: %s" % barrier_count) - Barrier count: 0 + >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict) + >>> print("Kernel launch count: %s" % launch_count) + Kernel launch count: 1 Now to make things more interesting, we'll create a kernel with barriers: @@ -1505,12 +1507,11 @@ using :func:`loopy.get_barrier_poly`: .. doctest:: - >>> barrier_poly = get_barrier_poly(knl) - >>> barrier_count = barrier_poly.eval_with_dict({}) - >>> print("Barrier polynomial: %s\nBarrier count: %i" % - ... (barrier_poly, barrier_count)) - Barrier polynomial: { 1000 } - Barrier count: 1000 + >>> sync_map = lp.get_synchronization_poly(knl) + >>> print(lp.stringify_stats_mapping(sync_map)) + barrier_local : { 1000 } + kernel_launch : { 1 } + <BLANKLINE> Based on the kernel code printed above, we would expect each thread to encounter 50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In diff --git a/loopy/__init__.py b/loopy/__init__.py index 8562df411..fc3fb208a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -107,7 +107,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, get_gmem_access_poly, - get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping, + get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping, sum_mem_access_to_bytes, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -205,7 +205,8 @@ __all__ = [ "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", "get_DRAM_access_poly", - "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes", + "get_synchronization_poly", "stringify_stats_mapping", + "sum_mem_access_to_bytes", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 1a044b8d7..8a31f67fb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -45,7 +45,7 @@ __doc__ = """ .. autofunction:: sum_mem_access_to_bytes -.. autofunction:: get_barrier_poly +.. autofunction:: get_synchronization_poly .. autofunction:: gather_access_footprints .. autofunction:: gather_access_footprint_bytes @@ -793,16 +793,21 @@ def sum_mem_access_to_bytes(m): # }}} -# {{{ get_barrier_poly +# {{{ get_synchronization_poly -def get_barrier_poly(knl): +def get_synchronization_poly(knl): - """Count the number of barriers each thread encounters in a loopy kernel. + """Count the number of synchronization events each thread encounters in a + loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls - made (in terms of the :class:`loopy.LoopKernel` *inames*). + :return: A dictionary mapping each type of synchronization event to a + :class:`islpy.PwQPolynomial` holding the number of such events + per thread. + + Possible keys include ``barrier_local``, ``barrier_global`` + (if supported by the target) and ``kernel_launch``. Example usage:: @@ -817,13 +822,27 @@ def get_barrier_poly(knl): """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import EnterLoop, LeaveLoop, Barrier + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + CallKernel, ReturnFromKernel, RunInstruction) from operator import mul knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) knl = lp.get_one_scheduled_kernel(knl) iname_list = [] - barrier_poly = isl.PwQPolynomial('{ 0 }') + + result = ToCountMap() + + one = isl.PwQPolynomial('{ 1 }') + + def get_count_poly(iname_list): + if iname_list: # (if iname_list is not empty) + ct = (count(knl, ( + knl.get_inames_domain(iname_list). + project_out_except(iname_list, [dim_type.set]) + )), ) + return reduce(mul, ct) + else: + return one for sched_item in knl.schedule: if isinstance(sched_item, EnterLoop): @@ -832,17 +851,23 @@ def get_barrier_poly(knl): elif isinstance(sched_item, LeaveLoop): if sched_item.iname: # (if not empty) iname_list.pop() + elif isinstance(sched_item, Barrier): - if iname_list: # (if iname_list is not empty) - ct = (count(knl, ( - knl.get_inames_domain(iname_list). - project_out_except(iname_list, [dim_type.set]) - )), ) - barrier_poly += reduce(mul, ct) - else: - barrier_poly += isl.PwQPolynomial('{ 1 }') + result = result + ToCountMap( + {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)}) + + elif isinstance(sched_item, CallKernel): + result = result + ToCountMap( + {"kernel_launch": get_count_poly(iname_list)}) + + elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): + pass + + else: + raise LoopyError("unexpected schedule item: %s" + % type(sched_item).__name__) - return barrier_poly + return result.dict # }}} diff --git a/test/test_statistics.py b/test/test_statistics.py index 2cf537f5e..4bcacf59e 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,7 +28,6 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp -from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly import numpy as np @@ -46,7 +45,7 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_op_poly(knl) + poly = lp.get_op_poly(knl) n = 512 m = 256 l = 128 @@ -71,7 +70,7 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = get_op_poly(knl) + poly = lp.get_op_poly(knl) n = 512 m = 256 l = 128 @@ -93,7 +92,7 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = get_op_poly(knl) + poly = lp.get_op_poly(knl) n = 512 m = 256 l = 128 @@ -122,7 +121,7 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_op_poly(knl) + poly = lp.get_op_poly(knl) n = 512 m = 256 l = 128 @@ -158,7 +157,7 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - poly = get_op_poly(knl) + poly = lp.get_op_poly(knl) n = 512 m = 256 l = 128 @@ -197,7 +196,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')] + poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -221,7 +220,7 @@ def test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -255,7 +254,7 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -283,7 +282,7 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -317,7 +316,7 @@ def test_gmem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -358,7 +357,7 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -392,7 +391,7 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = get_gmem_access_poly(knl) # noqa + poly = lp.get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -436,7 +435,7 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = get_gmem_access_poly(knl) # noqa + poly = lp.get_gmem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -475,7 +474,7 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = get_gmem_access_poly(knl) + poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 @@ -514,13 +513,13 @@ def test_barrier_counter_nobarriers(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = get_barrier_poly(knl) + sync_poly = lp.get_synchronization_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - barrier_count = poly.eval_with_dict(params) - assert barrier_count == 0 + assert len(sync_poly) == 1 + assert sync_poly["kernel_launch"].eval_with_dict(params) == 1 def test_barrier_counter_barriers(): @@ -540,12 +539,13 @@ def test_barrier_counter_barriers(): ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") - poly = get_barrier_poly(knl) + poly = lp.get_synchronization_poly(knl) + print(poly) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - barrier_count = poly.eval_with_dict(params) + barrier_count = poly["barrier_local"].eval_with_dict(params) assert barrier_count == 50*10*2 @@ -566,10 +566,11 @@ def test_all_counters_parallel_matmul(): l = 128 params = {'n': n, 'm': m, 'l': l} - barrier_count = get_barrier_poly(knl).eval_with_dict(params) - assert barrier_count == 0 + sync_poly = lp.get_synchronization_poly(knl) + assert len(sync_poly) == 1 + assert sync_poly["kernel_launch"].eval_with_dict(params) == 1 - op_map = get_op_poly(knl) + op_map = lp.get_op_poly(knl) f32mul = op_map[ (np.dtype(np.float32), 'mul') ].eval_with_dict(params) @@ -586,7 +587,7 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 assert i32ops == n*m*l*4 + l*n*4 - subscript_map = get_gmem_access_poly(knl) + subscript_map = lp.get_gmem_access_poly(knl) f32uncoal = subscript_map[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict(params) -- GitLab