diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 29fab76e7af1d0a0dfa548a056a36273cf553b38..06756df56fb370ab0b8f05b0273510bbf739437b 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1243,26 +1243,30 @@ Obtaining Performance Statistics .. {{{ -Operations, array access, and barriers can all be counted, which may facilitate -performance prediction and optimization of a :mod:`loopy` kernel. +Arithmetic operations, array accesses, and synchronization operations can all +be counted, which may facilitate performance prediction and optimization of a +:mod:`loopy` kernel. .. note:: The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples - above, you may need to reset these before continuing: + above, you may want to reset these before continuing. We will temporarily + suppress warnings to keep the output clean: .. doctest:: - >>> from warnings import resetwarnings + >>> from warnings import resetwarnings, filterwarnings >>> resetwarnings() + >>> filterwarnings('ignore', category=Warning) Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_poly` provides information on the number and type of operations -being performed in a kernel. To demonstrate this, we'll create an example kernel -that performs several operations on arrays containing different types of data: +:func:`loopy.get_op_map` provides information on the characteristics and +quantity of arithmetic operations being performed in a kernel. To demonstrate +this, we'll create an example kernel that performs several operations on arrays +containing different types of data: .. doctest:: @@ -1280,37 +1284,41 @@ information provided. Now we will count the operations: .. 
doctest:: - >>> from loopy.statistics import get_op_poly - >>> op_map = get_op_poly(knl) + >>> op_map = lp.get_op_map(knl) + >>> print(lp.stringify_stats_mapping(op_map)) + Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + -:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** -:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The -:class:`islpy.PwQPolynomial` holds the number of operations for the type specified -in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this -map now: +:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A +:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an +arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the +number of operations matching the characteristics of the :class:`loopy.Op` +specified in the key (in terms of the :class:`loopy.LoopKernel` +*inames*). :class:`loopy.Op` attributes include: -.. doctest:: +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. 
- >>> print(lp.stringify_stats_mapping(op_map)) - (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - +- name: A :class:`str` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict) - >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict) - >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict) - >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict) - >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict) - >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1320,174 +1328,238 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: 65536 65536 -Counting array accesses -~~~~~~~~~~~~~~~~~~~~~~~ +:class:`loopy.ToCountMap` provides member functions that facilitate filtering, +grouping, and evaluating subsets of the counts. Suppose we want to know the +total number of 32-bit operations of any kind. We can easily count these +using functions :func:`loopy.ToCountMap.filter_by` and +:func:`loopy.ToCountMap.eval_and_sum`: + +.. doctest:: -:func:`loopy.get_gmem_access_poly` provides information on the number and type of -array loads and stores being performed in a kernel. To demonstrate this, we'll -continue using the kernel from the previous example: + >>> filtered_op_map = op_map.filter_by(dtype=[np.float32]) + >>> f32op_count = filtered_op_map.eval_and_sum(param_dict) + >>> print(f32op_count) + 1572864 + +We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`, +which produces a :class:`loopy.ToCountMap` that contains the same counts grouped +together into keys containing only the specified fields: .. 
doctest:: - >>> from loopy.statistics import get_gmem_access_poly - >>> load_store_map = get_gmem_access_poly(knl) - >>> print(lp.stringify_stats_mapping(load_store_map)) - (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + >>> op_map_dtype = op_map.group_by('dtype') + >>> print(lp.stringify_stats_mapping(op_map_dtype)) + Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) + ... ].eval_with_dict(param_dict) + >>> print(f32op_count) + 1572864 + +See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for +more information on these functions. + +Counting memory accesses +~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`loopy.get_mem_access_map` provides information on the number and +characteristics of memory accesses performed in a kernel. To demonstrate this, +we'll continue using the kernel from the previous example: + +.. 
doctest:: + + >>> mem_map = lp.get_mem_access_map(knl) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + + +:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. +:class:`loopy.MemAccess` attributes include: -:func:`loopy.get_gmem_access_poly` returns a mapping of **{(** -:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** -**:** :class:`islpy.PwQPolynomial` **}**. +- mtype: A :class:`str` that specifies the memory type accessed as **global** + or **local** -- The :class:`numpy.dtype` specifies the type of the data being accessed. +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. -- The first string in the map key specifies the DRAM access type as *consecutive*, - *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when - consecutive threads access consecutive array elements in memory, *nonconsecutive* - accesses occur when consecutive threads access nonconsecutive array elements in - memory, and *uniform* accesses occur when consecutive threads access the *same* - element in memory. +- stride: An :class:`int` that specifies stride of the memory access. A stride + of 0 indicates a uniform access (i.e. all threads access the same item). 
-- The second string in the map key specifies the DRAM access type as a *load*, or a - *store*. +- direction: A :class:`str` that specifies the direction of memory access as + **load** or **store**. -- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the - characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` - *inames*). +- variable: A :class:`str` that specifies the variable name of the data + accessed. We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load") - ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store") - ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load") - ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store") - ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') + ... ].eval_with_dict(param_dict) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') + ... ].eval_with_dict(param_dict) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') + ... ].eval_with_dict(param_dict) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') + ... ].eval_with_dict(param_dict) + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 + +:class:`loopy.ToCountMap` also makes it easy to determine the total amount +of data moved in bytes. Suppose we want to know the total amount of global +memory data loaded and stored. 
We can produce a map with just this information +using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: + +.. doctest:: + + >>> bytes_map = mem_map.to_bytes() + >>> print(lp.stringify_stats_mapping(bytes_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + + >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] + ... ).group_by('direction') + >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) + MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 } + MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 } + + >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') + ... ].eval_with_dict(param_dict) + >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') + ... ].eval_with_dict(param_dict) + >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) + bytes loaded: 7340032 + bytes stored: 2621440 + +One can see how these functions might be useful in computing, for example, +achieved memory bandwidth in byte/sec or performance in FLOP/sec. ~~~~~~~~~~~ -Since we have not tagged any of the inames or parallelized the kernel across threads -(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers -the array accesses *uniform*. 
Now we'll parallelize the kernel and count the array -accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated -this time, so we'll print the mapping manually to make it more legible: +Since we have not tagged any of the inames or parallelized the kernel across +threads (which would have produced iname tags), :func:`loopy.get_mem_access_map` +considers the memory accesses *uniform*, so the *stride* of each access is 0. +Now we'll parallelize the kernel and count the array accesses again. The +resulting :class:`islpy.PwQPolynomial` will be more complicated this time. .. doctest:: - >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_gmem_access_poly(knl_consec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'consecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float32'), 'consecutive', 'store') : - [n, m, l] -> { ... } - - (dtype('float64'), 'consecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float64'), 'consecutive', 'store') : - [n, m, l] -> { ... } + >>> knl_consec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.1", inner_tag="l.0") + >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... } - With this parallelization, consecutive threads will access consecutive array elements in memory. 
The polynomials are a bit more complicated now due to the -parallelization, but when we evaluate them, we see that the total number of array -accesses has not changed: +parallelization, but when we evaluate them, we see that the total number of +array accesses has not changed: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store") - ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store") - ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') + ... ].eval_with_dict(param_dict) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') + ... ].eval_with_dict(param_dict) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') + ... ].eval_with_dict(param_dict) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') + ... ].eval_with_dict(param_dict) + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in -our parallelization of the kernel: +To produce *nonconsecutive* array accesses with stride greater than 1, we'll +switch the inner and outer tags in our parallelization of the kernel: .. 
doctest:: - >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_gmem_access_poly(knl_nonconsec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float32'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } - - (dtype('float64'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float64'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } + >>> knl_nonconsec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.0", inner_tag="l.1") + >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... } +With this parallelization, consecutive threads will access *nonconsecutive* +array elements in memory. The total number of array accesses still has not +changed: -With this parallelization, consecutive threads will access *nonconsecutive* array -elements in memory. The total number of array accesses has not changed: +.. doctest:: + + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') + ... ].eval_with_dict(param_dict) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') + ... ].eval_with_dict(param_dict) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') + ... ].eval_with_dict(param_dict) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') + ... 
].eval_with_dict(param_dict) + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 + +We can also filter using an arbitrary test function using +:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter +criteria are more complicated than a simple list of allowable values: .. doctest:: - >>> f64ld = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "store") - ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "store") - ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> def f(key): + ... from loopy.types import to_loopy_type + ... return key.dtype == to_loopy_type(np.float32) and \ + ... key.stride > 1 + >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict) + >>> print(count) + 2097152 Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_synchronization_poly` counts the number of synchronization +:func:`loopy.get_synchronization_map` counts the number of synchronization events per **thread** in a kernel. First, we'll call this function on the kernel from the previous example: .. 
doctest:: - >>> from loopy.statistics import get_synchronization_poly - >>> barrier_poly = get_synchronization_poly(knl) - >>> print(lp.stringify_stats_mapping(barrier_poly)) + >>> sync_map = lp.get_synchronization_map(knl) + >>> print(lp.stringify_stats_mapping(sync_map)) kernel_launch : { 1 } @@ -1495,7 +1567,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. doctest:: - >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1537,24 +1609,24 @@ Now to make things more interesting, we'll create a kernel with barriers: } } - -In this kernel, when a thread performs the second instruction it uses data produced -by *different* threads during the first instruction. Because of this, barriers are -required for correct execution, so loopy inserts them. Now we'll count the barriers -using :func:`loopy.get_barrier_poly`: +In this kernel, when a thread performs the second instruction it uses data +produced by *different* threads during the first instruction. Because of this, +barriers are required for correct execution, so loopy inserts them. Now we'll +count the barriers using :func:`loopy.get_synchronization_map`: .. doctest:: - >>> sync_map = lp.get_synchronization_poly(knl) + >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) barrier_local : { 1000 } kernel_launch : { 1 } -Based on the kernel code printed above, we would expect each thread to encounter -50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In -this case, the number of barriers does not depend on any inames, so we can pass an -empty dictionary to :func:`islpy.eval_with_dict`. 
+Based on the kernel code printed above, we would expect each thread to +encounter 50x10x2 barriers, which matches the result from +:func:`loopy.get_synchronization_map`. In this case, the number of barriers +does not depend on any inames, so we can pass an empty dictionary to +:func:`islpy.eval_with_dict`. .. }}} diff --git a/loopy/__init__.py b/loopy/__init__.py index 200d871755fdf2c1cbf95db1c7d83c5b6b5441bc..110652cf75d467ceb473d4997142f4dabe3e763b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -112,10 +112,10 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.type_inference import infer_unknown_types from loopy.preprocess import preprocess_kernel, realize_reduction from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, - get_gmem_access_poly, - get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping, - sum_mem_access_to_bytes, +from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op, + MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, + get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, + get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -221,10 +221,10 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", - "get_DRAM_access_poly", - "get_synchronization_poly", "stringify_stats_mapping", - "sum_mem_access_to_bytes", + "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess", + "get_op_poly", "get_op_map", "get_lmem_access_poly", + "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", + "get_synchronization_poly", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 
a4662f8d7782bb9bbc2de263c8b8d02a649d9430..2ec5eb0d4d5e32dbd9eb201ab718078a6b36f7d8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -25,6 +25,7 @@ THE SOFTWARE. import six import loopy as lp +import numpy as np import warnings from islpy import dim_type import islpy as isl @@ -39,13 +40,13 @@ __doc__ = """ .. currentmodule:: loopy -.. autofunction:: get_op_poly +.. autoclass:: ToCountMap +.. autoclass:: Op +.. autoclass:: MemAccess -.. autofunction:: get_gmem_access_poly - -.. autofunction:: sum_mem_access_to_bytes - -.. autofunction:: get_synchronization_poly +.. autofunction:: get_op_map +.. autofunction:: get_mem_access_map +.. autofunction:: get_synchronization_map .. autofunction:: gather_access_footprints .. autofunction:: gather_access_footprint_bytes @@ -55,18 +56,27 @@ __doc__ = """ # {{{ ToCountMap -class ToCountMap: - """Maps any type of key to an arithmetic type.""" +class ToCountMap(object): + """Maps any type of key to an arithmetic type. + + .. automethod:: filter_by + .. automethod:: filter_by_func + .. automethod:: group_by + .. automethod:: to_bytes + .. automethod:: sum + .. automethod:: eval_and_sum + + """ def __init__(self, init_dict=None): if init_dict is None: init_dict = {} - self.dict = init_dict + self.count_map = init_dict def __add__(self, other): - result = self.dict.copy() - for k, v in six.iteritems(other.dict): - result[k] = self.dict.get(k, 0) + v + result = self.count_map.copy() + for k, v in six.iteritems(other.count_map): + result[k] = self.count_map.get(k, 0) + v return ToCountMap(result) def __radd__(self, other): @@ -80,8 +90,8 @@ class ToCountMap: def __mul__(self, other): if isinstance(other, isl.PwQPolynomial): return ToCountMap(dict( - (index, self.dict[index]*other) - for index in self.dict.keys())) + (index, self.count_map[index]*other) + for index in self.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {0} {1}." 
@@ -91,12 +101,262 @@ class ToCountMap: def __getitem__(self, index): try: - return self.dict[index] + return self.count_map[index] except KeyError: return isl.PwQPolynomial('{ 0 }') + def __setitem__(self, index, value): + self.count_map[index] = value + def __repr__(self): - return repr(self.dict) + return repr(self.count_map) + + def __len__(self): + return len(self.count_map) + + def items(self): + return self.count_map.items() + + def keys(self): + return self.count_map.keys() + + def pop(self, item): + return self.count_map.pop(item) + + def copy(self): + return ToCountMap(dict(self.count_map)) + + def filter_by(self, **kwargs): + """Remove items without specified key fields. + + :parameter \*\*kwargs: Keyword arguments matching fields in the keys of + the :class:`ToCountMap`, each given a list of + allowable values for that key field. + + :return: A :class:`ToCountMap` containing the subset of the items in + the original :class:`ToCountMap` that match the field values + passed. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_map(knl) + filtered_map = mem_map.filter_by(direction=['load'], + variable=['a','g']) + tot_loads_a_g = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + from loopy.types import to_loopy_type + if 'dtype' in kwargs.keys(): + kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] + + # for each item in self.count_map + for self_key, self_val in self.items(): + try: + # check to see if key attribute values match all filters + for arg_field, allowable_vals in kwargs.items(): + attr_val = getattr(self_key, arg_field) + # see if the value is in the filter list + if attr_val not in allowable_vals: + break + else: # loop terminated without break or error + result_map[self_key] = self_val + except(AttributeError): + # the field passed is not a field of 
this key + continue + + return result_map + + def filter_by_func(self, func): + """Keep items that pass a test. + + :parameter func: A function that takes a map key as a parameter and + returns a :class:`bool`. + + :return: A :class:`ToCountMap` containing the subset of the items in + the original :class:`ToCountMap` for which func(key) is true. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_map(knl) + def filter_func(key): + return key.stride > 1 and key.stride <= 4 + + filtered_map = mem_map.filter_by_func(filter_func) + tot = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + # for each item in self.count_map, call func on the key + for self_key, self_val in self.items(): + if func(self_key): + result_map[self_key] = self_val + + return result_map + + def group_by(self, *args): + """Group map items together, distinguishing by only the key fields + passed in args. + + :parameter \*args: Zero or more :class:`str` fields of map keys. + + :return: A :class:`ToCountMap` containing the same total counts + grouped together by new keys that only contain the fields + specified in the arguments passed. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_map(knl) + grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + + f32_global_ld = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_global_st = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + f32_local_ld = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_local_st = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + + op_map = get_op_map(knl) + ops_dtype = op_map.group_by('dtype') + + f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) + f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) + i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + # make sure all item keys have same type + if self.count_map: + key_type = type(list(self.keys())[0]) + if not all(isinstance(x, key_type) for x in self.keys()): + raise ValueError("ToCountMap: group_by() function may only " + "be used on ToCountMaps with uniform keys") + else: + return result_map + + # for each item in self.count_map + for self_key, self_val in self.items(): + new_key = key_type() + + # set all specified fields + for field in args: + setattr(new_key, field, getattr(self_key, field)) + + if new_key in result_map.keys(): + result_map[new_key] += self_val + else: + result_map[new_key] = self_val + + return result_map + + def to_bytes(self): + """Convert counts to bytes using data type in map key. + + :return: A :class:`ToCountMap` mapping each original key to a + :class:`islpy.PwQPolynomial` with counts in bytes rather than + instances. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + bytes_map = get_mem_access_map(knl).to_bytes() + params = {'n': 512, 'm': 256, 'l': 128} + + s1_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['load']).eval_and_sum(params) + s2_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['load']).eval_and_sum(params) + s1_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['store']).eval_and_sum(params) + s2_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['store']).eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result = self.copy() + + for key, val in self.items(): + bytes_processed = int(key.dtype.itemsize) * val + result[key] = bytes_processed + + return result + + + def sum(self): + """Add all counts in ToCountMap. + + :return: A :class:`islpy.PwQPolynomial` containing the sum of counts. + + """ + total = isl.PwQPolynomial('{ 0 }') + for k, v in self.items(): + if not isinstance(v, isl.PwQPolynomial): + raise ValueError("ToCountMap: sum() encountered type {0} but " + "may only be used on PwQPolynomials." + .format(type(v))) + total += v + return total + + + def eval_and_sum(self, params): + """Add all counts in :class:`ToCountMap` and evaluate with provided + parameter dict. + + :return: An :class:`int` containing the sum of all counts in the + :class:`ToCountMap` evaluated with the parameters provided. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_map(knl) + filtered_map = mem_map.filter_by(direction=['load'], + variable=['a','g']) + tot_loads_a_g = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + return self.sum().eval_with_dict(params) # }}} @@ -108,6 +368,143 @@ def stringify_stats_mapping(m): return result +class Op(object): + """An arithmetic operation. + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. + + .. attribute:: name + + A :class:`str` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + + """ + + def __init__(self, dtype=None, name=None): + self.name = name + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) + + def __eq__(self, other): + return isinstance(other, Op) and ( + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.name is None or other.name is None or + self.name == other.name)) + + def __hash__(self): + return hash(str(self)) + + def __str__(self): + if self.dtype is None: + dtype = 'None' + else: + dtype = str(self.dtype) + if self.name is None: + name = 'None' + else: + name = self.name + return "Op("+dtype+", "+name+")" + + +class MemAccess(object): + """A memory access. + + .. attribute:: mtype + + A :class:`str` that specifies the memory type accessed as **global** + or **local** + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. + + .. attribute:: stride + + An :class:`int` that specifies stride of the memory access. A stride of 0 + indicates a uniform access (i.e. all threads access the same item). + + .. 
attribute:: direction + + A :class:`str` that specifies the direction of memory access as + **load** or **store**. + + .. attribute:: variable + + A :class:`str` that specifies the variable name of the data + accessed. + + """ + + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): + self.mtype = mtype + self.stride = stride + self.direction = direction + self.variable = variable + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) + + #TODO currently giving all lmem access stride=None + if (mtype == 'local') and (stride is not None): + raise NotImplementedError("MemAccess: stride must be None when " + "mtype is 'local'") + + #TODO currently giving all lmem access variable=None + if (mtype == 'local') and (variable is not None): + raise NotImplementedError("MemAccess: variable must be None when " + "mtype is 'local'") + + def __eq__(self, other): + return isinstance(other, MemAccess) and ( + (self.mtype is None or other.mtype is None or + self.mtype == other.mtype) and + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.stride is None or other.stride is None or + self.stride == other.stride) and + (self.direction is None or other.direction is None or + self.direction == other.direction) and + (self.variable is None or other.variable is None or + self.variable == other.variable)) + + def __hash__(self): + return hash(str(self)) + + def __str__(self): + if self.mtype is None: + mtype = 'None' + else: + mtype = self.mtype + if self.dtype is None: + dtype = 'None' + else: + dtype = str(self.dtype) + if self.stride is None: + stride = 'None' + else: + stride = str(self.stride) + if self.direction is None: + direction = 'None' + else: + direction = self.direction + if self.variable is None: + variable = 'None' + else: + variable = self.variable + return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \ + 
+variable+")" + + # {{{ ExpressionOpCounter class ExpressionOpCounter(CombineMapper): @@ -126,41 +523,33 @@ class ExpressionOpCounter(CombineMapper): map_tagged_variable = map_constant map_variable = map_constant - #def map_wildcard(self, expr): - # return 0,0 - - #def map_function_symbol(self, expr): - # return 0,0 - def map_call(self, expr): return ToCountMap( - {(self.type_inf(expr), 'func:'+str(expr.function)): 1} + {Op(dtype=self.type_inf(expr), + name='func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) - # def map_call_with_kwargs(self, expr): # implemented in CombineMapper - - def map_subscript(self, expr): # implemented in CombineMapper + def map_subscript(self, expr): return self.rec(expr.index) - # def map_lookup(self, expr): # implemented in CombineMapper - def map_sum(self, expr): assert expr.children return ToCountMap( - {(self.type_inf(expr), 'add'): len(expr.children)-1} + {Op(dtype=self.type_inf(expr), + name='add'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1}) + return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({(self.type_inf(expr), 'mul'): -1}) + ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({(self.type_inf(expr), 'div'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -168,54 +557,47 @@ class ExpressionOpCounter(CombineMapper): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return 
ToCountMap({(self.type_inf(expr), 'shift'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap( - {(self.type_inf(expr), 'bw'): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): + len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or map_bitwise_and = map_bitwise_or - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - def map_logical_not(self, expr): - return self.rec(expr.child) - - def map_logical_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_logical_and = map_logical_or - def map_if(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_if_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_ifpos_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap( - {(self.type_inf(expr), 'maxmin'): len(expr.children)-1} - ) + 
sum(self.rec(child) for child in expr.children) + return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): + len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) map_max = map_min @@ -225,11 +607,13 @@ class ExpressionOpCounter(CombineMapper): "map_common_subexpression not implemented.") def map_substitution(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered substitution, " + raise NotImplementedError("ExpressionOpCounter encountered " + "substitution, " "map_substitution not implemented.") def map_derivative(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered derivative, " + raise NotImplementedError("ExpressionOpCounter encountered " + "derivative, " "map_derivative not implemented.") def map_slice(self, expr): @@ -239,6 +623,83 @@ class ExpressionOpCounter(CombineMapper): # }}} +# {{{ LocalSubscriptCounter + +class LocalSubscriptCounter(CombineMapper): + + def __init__(self, knl): + self.knl = knl + from loopy.type_inference import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + map_tagged_variable = map_constant + map_variable = map_constant + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_subscript(self, expr): + sub_map = ToCountMap() + name = expr.aggregate.name # name of array + if name in self.knl.temporary_variables: + array = self.knl.temporary_variables[name] + if array.is_local: + sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 + return sub_map + self.rec(expr.index) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + def map_if(self, expr): + warn_with_kernel(self.knl, "summing_if_branches_lsubs", + 
"LocalSubscriptCounter counting LMEM accesses as sum " + "of if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_if_positive(self, expr): + warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", + "LocalSubscriptCounter counting LMEM accesses as sum " + "of if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_common_subexpression(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "common_subexpression, " + "map_common_subexpression not implemented.") + + def map_substitution(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "substitution, " + "map_substitution not implemented.") + + def map_derivative(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "derivative, " + "map_derivative not implemented.") + + def map_slice(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered slice, " + "map_slice not implemented.") + +# }}} + + # {{{ GlobalSubscriptCounter class GlobalSubscriptCounter(CombineMapper): @@ -278,33 +739,52 @@ class GlobalSubscriptCounter(CombineMapper): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, GroupIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - local_id0 = None + + # find min tag axis + import sys + min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: - # find local id0 tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - if tag.axis == 0: - local_id0 = iname - break # there will be only one local_id0 + if tag.axis < min_tag_axis: + min_tag_axis = tag.axis if not local_id_found: # count as uniform access - return ToCountMap( - {(self.type_inf(expr), 'uniform'): 1} - ) + self.rec(expr.index) + return 
ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name): 1} + ) + self.rec(expr.index) + + if min_tag_axis != 0: + warn_with_kernel(self.knl, "unknown_gmem_stride", + "GlobalSubscriptCounter: Memory access minimum " + "tag axis %d != 0, stride unknown, using " + "sys.maxsize." % (min_tag_axis)) + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name): 1} + ) + self.rec(expr.index) + + # get local_id associated with minimum tag axis + min_lid = None + for iname in my_inames: + tag = self.knl.iname_to_tag.get(iname) + if isinstance(tag, LocalIndexTag): + if tag.axis == min_tag_axis: + min_lid = iname + break # there will be only one min local_id - if local_id0 is None: - # only non-zero local id(s) found, assume non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + # found local_id associated with minimum tag axis - # check coefficient of local_id0 for each axis + total_stride = 0 + # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector + from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): @@ -312,36 +792,22 @@ class GlobalSubscriptCounter(CombineMapper): coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx)) # check if he contains the lid 0 guy try: - coeff_id0 = coeffs[Variable(local_id0)] + coeff_min_lid = coeffs[Variable(min_lid)] except KeyError: - # does not contain local_id0 + # does not contain min_lid continue - - if coeff_id0 != 1: - # non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # coefficient is 1, now determine if stride is 1 - from loopy.kernel.array import FixedStrideArrayDimTag + # found coefficient of min_lid + # now determine stride if isinstance(axis_tag, 
FixedStrideArrayDimTag): stride = axis_tag.stride else: continue - if stride != 1: - # non-consecutive - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + total_stride += stride*coeff_min_lid - # else, stride == 1, continue since another idx could contain id0 - - # loop finished without returning, stride==1 for every instance of local_id0 - return ToCountMap( - {(self.type_inf(expr), 'consecutive'): 1} - ) + self.rec(expr.index) + return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), + stride=total_stride, variable=name): 1} + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -351,48 +817,19 @@ class GlobalSubscriptCounter(CombineMapper): map_product = map_sum - def map_quotient(self, expr, *args): - return self.rec(expr.numerator) + self.rec(expr.denominator) - - map_floor_div = map_quotient - map_remainder = map_quotient - - def map_power(self, expr): - return self.rec(expr.base) + self.rec(expr.exponent) - - def map_left_shift(self, expr): - return self.rec(expr.shiftee)+self.rec(expr.shift) - - map_right_shift = map_left_shift - - def map_bitwise_not(self, expr): - return self.rec(expr.child) - - def map_bitwise_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_bitwise_xor = map_bitwise_or - map_bitwise_and = map_bitwise_or - - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - map_logical_not = map_bitwise_not - map_logical_or = map_bitwise_or - map_logical_and = map_logical_or - def map_if(self, expr): - warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_if_branches_gsubs", + "GlobalSubscriptCounter counting GMEM accesses as " + "sum of if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def 
map_if_positive(self, expr): - warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) - - map_min = map_bitwise_or - map_max = map_min + warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", + "GlobalSubscriptCounter counting GMEM accesses as " + "sum of if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_common_subexpression(self, expr): raise NotImplementedError("GlobalSubscriptCounter encountered " @@ -524,7 +961,8 @@ def count(kernel, set): # {{{ rebuild check domain - zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space)) + zero = isl.Aff.zero_on_domain( + isl.LocalSpace.from_space(bset.space)) iname = isl.PwAff.from_aff( zero.set_coefficient_val(isl.dim_type.in_, i, 1)) dmin_matched = dmin.insert_dims( @@ -584,31 +1022,44 @@ def get_op_poly(knl, numpy_types=True): """Count the number of operations in a loopy kernel. + get_op_poly is deprecated. Use get_op_map instead. + + """ + warn_with_kernel(knl, "depricated_get_op_poly", + "get_op_poly is deprecated. Use get_op_map instead.") + return get_op_map(knl, numpy_types) + +# }}} + + +def get_op_map(knl, numpy_types=True): + + """Count the number of operations in a loopy kernel. + :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :return: A mapping of **{(** *type* **,** :class:`string` **)** - **:** :class:`islpy.PwQPolynomial` **}**. + :parameter numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. - - The *type* specifies the type of the data being - accessed. This can be a :class:`numpy.dtype` if - *numpy_types* is True, otherwise the internal - loopy type. + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** + :class:`islpy.PwQPolynomial` **}**. 
- - The string specifies the operation type as - *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + - The :class:`Op` specifies the characteristics of the arithmetic + operation. - The :class:`islpy.PwQPolynomial` holds the number of operations of the kind specified in the key (in terms of the - :class:`loopy.LoopKernel` *parameter inames*). + :class:`loopy.LoopKernel` parameter *inames*). Example usage:: # (first create loopy kernel and specify array data types) - poly = get_op_poly(knl) + op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params) # (now use these counts to predict performance) @@ -618,88 +1069,128 @@ def get_op_poly(knl, numpy_types=True): knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - op_poly = ToCountMap() + op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: # how many times is this instruction executed? 
# check domain size: insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_poly = op_poly + ops*count(knl, domain) - result = op_poly.dict + op_map = op_map + ops*count(knl, domain) if numpy_types: - result = dict( - ((dtype.numpy_dtype, kind), count) - for (dtype, kind), count in six.iteritems(result)) + op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + count) + for op, count in six.iteritems(op_map.count_map)) - return result -# }}} + return op_map -def sum_ops_to_dtypes(op_poly_dict): - result = {} - for (dtype, kind), v in op_poly_dict.items(): - new_key = dtype - if new_key in result: - result[new_key] += v - else: - result[new_key] = v +#TODO test deprecated functions? +def get_lmem_access_poly(knl): + """Count the number of local memory accesses in a loopy kernel. - return result + get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['local'] option. + + """ + warn_with_kernel(knl, "depricated_get_lmem_access_poly", + "get_lmem_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['local'] option.") + return get_mem_access_map(knl).filter_by(mtype=['local']) + + +def get_DRAM_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "depricated_get_DRAM_access_poly", + "get_DRAM_access_poly is deprecated. 
Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) # {{{ get_gmem_access_poly -def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscripts +def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be - counted. + get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. - :return: A mapping of **{(** *type* **,** :class:`string` **,** - :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. + """ + warn_with_kernel(knl, "depricated_get_gmem_access_poly", + "get_gmem_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + +# }}} - - The *type* specifies the type of the data being - accessed. This can be a :class:`numpy.dtype` if - *numpy_types* is True, otherwise the internal - loopy type. - - The first string in the map key specifies the global memory - access type as - *consecutive*, *nonconsecutive*, or *uniform*. +def get_mem_access_map(knl, numpy_types=True): + """Count the number of memory accesses in a loopy kernel. + + :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. - - The second string in the map key specifies the global memory - access type as a - *load*, or a *store*. + :parameter numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. - - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). 
+ :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`MemAccess` specifies the characteristics of the + memory access. + + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: # (first create loopy kernel and specify array data types) - subscript_map = get_gmem_access_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - - f32_uncoalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_store = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'store') - ].eval_with_dict(params) + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='load', + variable='a') + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='store', + variable='a') + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='load', + variable='x') + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='store', + variable='x') + ].eval_with_dict(params) # (now use these counts to predict performance) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types class CacheHolder(object): @@ -712,7 +1203,8 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr if uniform: from loopy.kernel.data import LocalIndexTag insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + isinstance( + knl.iname_to_tag.get(iname), LocalIndexTag)] 
inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) @@ -721,82 +1213,82 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_poly = ToCountMap() - subscript_counter = GlobalSubscriptCounter(knl) + subs_map = ToCountMap() + subs_counter_g = GlobalSubscriptCounter(knl) + subs_counter_l = LocalSubscriptCounter(knl) + for insn in knl.instructions: - # count subscripts, distinguishing loads and stores - subs_expr = subscript_counter(insn.expression) - subs_expr = ToCountMap(dict( - (key + ("load",), val) - for key, val in six.iteritems(subs_expr.dict))) - subs_assignee = subscript_counter(insn.assignee) - subs_assignee = ToCountMap(dict( - (key + ("store",), val) - for key, val in six.iteritems(subs_assignee.dict))) + # count subscripts + subs_expr = subs_counter_g(insn.expression) \ + + subs_counter_l(insn.expression) + + # distinguish loads and stores + for key in subs_expr.count_map: + subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, direction='load', + variable=key.variable) + ] = subs_expr.pop(key) + + subs_assignee_g = subs_counter_g(insn.assignee) + for key in subs_assignee_g.count_map: + subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, + direction='store', + variable=key.variable) + ] = subs_assignee_g.pop(key) + # for now, don't count writes to local mem insn_inames = knl.insn_inames(insn) # use count excluding local index tags for uniform accesses - for key in subs_expr.dict: - poly = ToCountMap({key: subs_expr.dict[key]}) - if key[1] == "uniform": - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) + for key in subs_expr.count_map: + map = ToCountMap({key: subs_expr[key]}) + if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: + subs_map = subs_map \ + + 
map*get_insn_count(knl, insn_inames, True) else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) - for key in subs_assignee.dict: - poly = ToCountMap({key: subs_assignee.dict[key]}) - if key[1] == "uniform": - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) + subs_map = subs_map + map*get_insn_count(knl, insn_inames) + #currently not counting stride of local mem access + + for key in subs_assignee_g.count_map: + map = ToCountMap({key: subs_assignee_g[key]}) + if isinstance(key.stride, int) and key.stride == 0: + subs_map = subs_map \ + + map*get_insn_count(knl, insn_inames, True) else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) - - result = subs_poly.dict + subs_map = subs_map + map*get_insn_count(knl, insn_inames) + # for now, don't count writes to local mem if numpy_types: - result = dict( - ((dtype.numpy_dtype, kind, direction), count) - for (dtype, kind, direction), count in six.iteritems(result)) - - return result + subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable) + , count) + for mem_access, count in six.iteritems(subs_map.count_map)) - -def get_DRAM_access_poly(knl): - from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead", - DeprecationWarning, stacklevel=2) - return get_gmem_access_poly(knl) - -# }}} + return subs_map -# {{{ sum_mem_access_to_bytes +# {{{ get_synchronization_poly -def sum_mem_access_to_bytes(m): - """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping +def get_synchronization_poly(knl): + """Count the number of synchronization events each thread encounters in a + loopy kernel. - **{(** :class:`string` **,** :class:`string` **)** - **:** :class:`islpy.PwQPolynomial` **}** + get_synchronization_poly is deprecated. Use get_synchronization_map instead. 
- i.e., aggregate the transfer numbers for all types into a single byte count. """ - - result = {} - for (dtype, kind, direction), v in m.items(): - new_key = (kind, direction) - bytes_transferred = int(dtype.itemsize) * v - if new_key in result: - result[new_key] += bytes_transferred - else: - result[new_key] = bytes_transferred - - return result + warn_with_kernel(knl, "depricated_get_synchronization_poly", + "get_synchronization_poly is deprecated. Use " + "get_synchronization_map instead.") + return get_synchronization_map(knl) # }}} -# {{{ get_synchronization_poly - -def get_synchronization_poly(knl): +def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. @@ -804,8 +1296,8 @@ def get_synchronization_poly(knl): :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of such events - per thread. + :class:`islpy.PwQPolynomial` holding the number of events per + thread. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. 
@@ -814,9 +1306,9 @@ def get_synchronization_poly(knl): # (first create loopy kernel and specify array data types) - barrier_poly = get_barrier_poly(knl) + sync_map = get_synchronization_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - barrier_count = barrier_poly.eval_with_dict(params) + barrier_ct = sync_map['barrier_local'].eval_with_dict(params) # (now use this count to predict performance) @@ -854,8 +1346,8 @@ def get_synchronization_poly(knl): iname_list.pop() elif isinstance(sched_item, Barrier): - result = result + ToCountMap( - {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)}) + result = result + ToCountMap({"barrier_%s" % sched_item.kind: + get_count_poly(iname_list)}) elif isinstance(sched_item, CallKernel): result = result + ToCountMap( @@ -868,9 +1360,8 @@ def get_synchronization_poly(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - return result.dict - -# }}} + #return result.count_map #TODO is this change okay? + return result # {{{ gather_access_footprints @@ -881,7 +1372,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): of each the array *var_name* are read/written (where *direction* is either ``read`` or ``write``. - :arg ignore_uncountable: If *True*, an error will be raised for + :arg ignore_uncountable: If *False*, an error will be raised for accesses on which the footprint cannot be determined (e.g. 
data-dependent or nonlinear indices) """ @@ -905,7 +1396,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False): insn_inames = kernel.insn_inames(insn) inames_domain = kernel.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) + domain = (inames_domain.project_out_except(insn_inames, + [dim_type.set])) afg = AccessFootprintGatherer(kernel, domain, ignore_uncountable=ignore_uncountable) @@ -947,7 +1439,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): kernel = preprocess_kernel(kernel) result = {} - fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable) + fp = gather_access_footprints(kernel, + ignore_uncountable=ignore_uncountable) for key, var_fp in fp.items(): vname, direction = key diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 0c304b7a854579007f57ba204cbff8f440aaf5fc..c85aa80ec92eb0185d30f96b478ae37043c0d7e0 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): if 1: print("OPS") - op_poly = lp.get_op_poly(hsv) - print(lp.stringify_stats_mapping(op_poly)) + op_map = lp.get_op_map(hsv) + print(lp.stringify_stats_mapping(op_map)) print("MEM") - gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv)) - print(lp.stringify_stats_mapping(gmem_poly)) + gmem_map = lp.get_mem_access_map(hsv).to_bytes() + print(lp.stringify_stats_mapping(gmem_map)) hsv = lp.set_options(hsv, cl_build_options=[ "-cl-denorms-are-zero", diff --git a/test/test_statistics.py b/test/test_statistics.py index 68be5b8a260858e058619c796b3836611c8d4f0f..fb502045c7b6b2c7e02d11ad3ebda3b5d13c8bda 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,8 +28,10 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp +from loopy.types import to_loopy_type import numpy as np +from 
pymbolic.primitives import Variable def test_op_counter_basic(): @@ -44,21 +46,22 @@ def test_op_counter_basic(): name="basic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 - + def test_op_counter_reduction(): @@ -70,15 +73,19 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l + op_map_dtype = op_map.group_by('dtype') + f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) + assert f32 == f32add 
+ f32mul + def test_op_counter_logic(): @@ -92,15 +99,15 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? assert f64add == n*m @@ -120,24 +127,25 @@ def test_op_counter_specialops(): name="specialops", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = 
op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m - assert f64pow == i32add == f64rsqrt == f64sin == n*m + assert f64pow == i32add == f64rsq == f64sin == n*m def test_op_counter_bitwise(): @@ -157,17 +165,17 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params) - i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -196,9 +204,9 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = 
lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')] + op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) - flops = poly.eval_with_dict(value_dict) + flops = op_map.eval_with_dict(value_dict) if expect_fallback: assert flops == 144 @@ -206,7 +214,7 @@ def test_op_counter_triangular_domain(): assert flops == 78 -def test_gmem_access_counter_basic(): +def test_mem_access_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i 1: exec(sys.argv[1])