diff --git a/doc/misc.rst b/doc/misc.rst
index 97bac9fec35d1960f0b8dceb9489f8399b72520c..347b5d098c8dc0e37bb72659c0b0de5a8b4e3704 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -101,6 +101,10 @@ In the meantime, you can generate code simply by saying::
     print(cg_result.host_code())
     print(cg_result.device_code())

+Additionally, for C-based languages, header definitions are available via::
+
+    loopy.generate_header(knl)
+
 For what types of codes does :mod:`loopy` work well?
 ----------------------------------------------------

diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 3a15e3a585afc9a9b181ae21b202e20a104ad2a1..97d71f3e04051d45a2f911eb0f7b2eca7147b96b 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -242,6 +242,12 @@ These are usually key-value pairs. The following attributes are recognized:
   heuristic and indicate that the specified list of dependencies is
   exhaustive.

+* ``dep_query=...`` provides an alternative way of specifying instruction
+  dependencies. The given string is parsed as a match expression object by
+  :func:`loopy.match.parse_match`. Upon kernel generation, this match
+  expression is used to match instructions in the kernel and add them as
+  dependencies.
+
 * ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary
   between the instructions with identifiers ``id1`` and ``id2``, even if
   a dependency chain exists and variables are accessed in an apparently
@@ -251,6 +257,9 @@ These are usually key-value pairs. The following attributes are recognized:
   function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
   ``id_prefix``.

+* ``nosync_query=...`` provides an alternative way of specifying ``nosync``,
+  just like ``dep_query`` and ``dep``.
+
 * ``priority=integer`` sets the instruction's priority to the value
   ``integer``. Instructions with higher priority will be scheduled sooner,
   if possible. Note that the scheduler may still schedule a lower-priority
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index f16f8bfdbb26b716af27762d8502bff592496d7c..4a07b63330747aa69d7ed498e004d60b7c312a7b 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -114,11 +114,15 @@ Finishing up

 .. autofunction:: get_one_scheduled_kernel

+.. autofunction:: save_and_reload_temporaries
+
 .. autoclass:: GeneratedProgram

 .. autoclass:: CodeGenerationResult

 .. autofunction:: generate_code_v2

+.. autofunction:: generate_header
+
 Setting options
 ---------------

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index fa6fcc95088198c28f17b2e383a54eb961419467..7c8ba2fc975265a7a76864b0de060ec58e492217 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -53,6 +53,13 @@ And some data on the host:

 .. }}}

+We'll also disable console syntax highlighting because it confuses
+doctest::
+
+    >>> # not a documented interface
+    >>> import loopy.options
+    >>> loopy.options.ALLOW_TERMINAL_COLORS = False
+
 Getting started
 ---------------

@@ -256,6 +263,14 @@ call :func:`loopy.generate_code`:
       out[i] = 2.0f * a[i];
     }

+Additionally, for C-based languages, header definitions can be obtained via
+:func:`loopy.generate_header`:
+
+.. doctest::
+    >>> header = str(lp.generate_header(typed_knl)[0])
+    >>> print(header)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out);
+
 .. }}}

 .. _ordering:
@@ -532,9 +547,8 @@ Consider this example:

     #define lid(N) ((int) get_local_id(N))
    ...
for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner) + a[16 * i_outer + i_inner] = 0.0f; ... By default, the new, split inames are named *OLD_outer* and *OLD_inner*, @@ -563,10 +577,9 @@ relation to loop nesting. For example, it's perfectly possible to request >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -791,7 +804,9 @@ enabling some cost savings: a[4 * i_outer + 3] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) + { + int const i_outer = -1 + n + -1 * (3 * n / 4); + <BLANKLINE> if (-1 + n >= 0) { a[4 * i_outer] = 0.0f; @@ -802,6 +817,7 @@ enabling some cost savings: if (4 + 4 * i_outer + -1 * n == 0) a[4 * i_outer + 3] = 0.0f; } + } ... .. }}} @@ -906,6 +922,8 @@ expression being assigned. ... """) >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) +.. _local_temporaries: + Temporaries in local memory ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1048,30 +1066,156 @@ Generic Precomputation .. }}} -.. _more-complicated-programs: -More complicated programs -------------------------- +.. _synchronization: + +Synchronization +--------------- .. {{{ -SCOP +In OpenCL, writes are not generally guaranteed to be immediately visible to +other work items. In order to ensure that memory is consistent across work +items, some sort of synchronization operation is used. -Data-dependent control flow -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:mod:`loopy` supports synchronization in the form of *barriers* or *atomic +operations*. -Conditionals -~~~~~~~~~~~~ +Barriers +~~~~~~~~ -Snippets of C -~~~~~~~~~~~~~ +Prior to code generation, :mod:`loopy` performs a check to see that every memory +access is free of dependencies requiring a barrier. A memory access dependency +that exists across multiple work items requires a barrier if it involves at +least one write operation. + +:mod:`loopy` supports two kinds of barriers: + +* *Local barriers* ensure consistency of local memory accesses to items within + *the same* work group. As in OpenCL, all work items in the group are required + to wait until everyone has reached the barrier instruction before continuing. + +* *Global barriers* ensure consistency of *global* memory accesses across *all* + work groups. Note that there is no exact equivalent in OpenCL. All work items + across all work groups are required to wait until everyone has reached the + barrier instruction before continuing. + +By default, :mod:`loopy` inserts local barriers between two instructions when it +detects that a dependency involving local memory may occur across work items. To +see this in action, take a look at the section on :ref:`local_temporaries`. + +In contrast, :mod:`loopy` will *not* insert global barriers +automatically. 
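+Before turning to a global-barrier example, here is a small sketch of the
+automatic local-barrier insertion just described. (This is an illustration,
+not a doctest; the kernel here is purely illustrative.) Prefetching an array
+into local memory will typically cause :mod:`loopy` to emit
+``barrier(CLK_LOCAL_MEM_FENCE)`` around the fetch::
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        "out[i] = 2*a[i]")
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
+    # add_prefetch creates a temporary for "a" that lands in local memory
+    # here (the fetch sweeps an l.0-tagged iname); code generation then
+    # inserts the local barriers needed between the cooperative fetch and
+    # the instructions that use the fetched data.
+    knl = lp.add_prefetch(knl, "a", ["i_inner"])
+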
Consider the following kernel, which attempts to rotate its input
+to the right by 1:
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(
+    ...     "[n] -> {[i] : 0<=i<n}",
+    ...     """
+    ...     for i
+    ...         <>tmp = arr[i] {id=maketmp,dep=*}
+    ...         arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp}
+    ...     end
+    ...     """,
+    ...     [
+    ...       lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+    ...       "...",
+    ...     ],
+    ...     name="rotate_v1",
+    ...     assumptions="n mod 16 = 0")
+    >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+    >>> cgr = lp.generate_code_v2(knl)
+    Traceback (most recent call last):
+    ...
+    MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
+
+Because of the write-after-read dependency in global memory, a global barrier
+needs to be inserted. This can be accomplished with a ``... gbarrier``
+instruction. Note that :mod:`loopy` implements global barriers by splitting the
+kernel into multiple device-side kernels, so that the resulting code will
+contain more than one kernel.
+
+.. doctest::
+
+    >>> knl = lp.make_kernel(
+    ...     "[n] -> {[i] : 0<=i<n}",
+    ...     """
+    ...     for i
+    ...         <>tmp = arr[i] {id=maketmp,dep=*}
+    ...         ... gbarrier {id=bar,dep=*maketmp}
+    ...         arr[(i + 1) % n] = tmp {id=rotate,dep=*bar}
+    ...     end
+    ...     """,
+    ...     [
+    ...       lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+    ...       "...",
+    ...     ],
+    ...     name="rotate_v2",
+    ...     assumptions="n mod 16 = 0")
+    >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+    >>> cgr = lp.generate_code_v2(knl)
+    >>> print(cgr.device_code())
+    #define lid(N) ((int) get_local_id(N))
+    #define gid(N) ((int) get_group_id(N))
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n)
+    {
+      int tmp;
+    <BLANKLINE>
+      tmp = arr[16 * gid(0) + lid(0)];
+    }
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n)
+    {
+      int tmp;
+    <BLANKLINE>
+      arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+    }
+
+Note that we are not done yet. While ``tmp`` is assigned in the first
+kernel, that assignment is not saved for the second
+kernel. :mod:`loopy` provides a function,
+:func:`loopy.save_and_reload_temporaries`, for exactly this purpose:
+saving and restoring temporary values across global barriers. In
+order to use this function, the kernel must be preprocessed and scheduled first;
+the latter is handled by :func:`loopy.get_one_scheduled_kernel`.
+
+.. doctest::
+
+    >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+    >>> knl = lp.save_and_reload_temporaries(knl)
+    >>> knl = lp.get_one_scheduled_kernel(knl)
+    >>> cgr = lp.generate_code_v2(knl)
+    >>> print(cgr.device_code())
+    #define lid(N) ((int) get_local_id(N))
+    #define gid(N) ((int) get_group_id(N))
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+    {
+      int tmp;
+    <BLANKLINE>
+      tmp = arr[16 * gid(0) + lid(0)];
+      tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
+    }
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+    {
+      int tmp;
+    <BLANKLINE>
+      tmp = tmp_save_slot[16 * gid(0) + lid(0)];
+      arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+    }
+    >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True)
+    >>> print(out)
+    [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]

 Atomic operations
 ~~~~~~~~~~~~~~~~~

-Loopy supports atomic operations. To use them, both the data on which the
-atomic operations work as well as the operations themselves must be
-suitably tagged, as in the following example::
+:mod:`loopy` supports atomic operations. To use them, both the data on which the
+atomic operations work as well as the operations themselves must be suitably
+tagged, as in the following example::

     knl = lp.make_kernel(
@@ -1086,6 +1230,49 @@ suitably tagged, as in the following example::

 .. }}}

+.. _more-complicated-programs:
+
+More complicated programs
+-------------------------
+
+.. {{{
+
+SCOP
+
+External Functions
+~~~~~~~~~~~~~~~~~~
+
+Loopy currently supports calls to several commonly used mathematical functions,
+e.g. exp/log, min/max, sin/cos/tan, sinh/cosh, abs, etc. They may be used in
+a loopy kernel by simply calling them, e.g.::
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        """
+        for i
+            a[i] = sqrt(i)
+        end
+        """)
+
+Additionally, all functions of one variable are currently recognized during
+code generation; however, additional implementation may be required for custom
+functions. The full list of available functions may be found in the
+:class:`TargetBase` implementation (e.g. :class:`CudaTarget`).
+
+Custom user functions may be represented using the method described in :ref:`functions`.
+
+
+Data-dependent control flow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Conditionals
+~~~~~~~~~~~~
+
+Snippets of C
+~~~~~~~~~~~~~
+
+.. }}}
+
 Common Problems
 ---------------
@@ -1213,26 +1400,30 @@ Obtaining Performance Statistics

 .. {{{

-Operations, array access, and barriers can all be counted, which may facilitate
-performance prediction and optimization of a :mod:`loopy` kernel.
+Arithmetic operations, array accesses, and synchronization operations can all
+be counted, which may facilitate performance prediction and optimization of a
+:mod:`loopy` kernel.

 .. note::

     The functions used in the following examples may produce warnings. If you have
     already made the filterwarnings and catch_warnings calls used in the examples
-    above, you may need to reset these before continuing:
+    above, you may want to reset these before continuing. We will temporarily
+    suppress warnings to keep the output clean:

 ..
doctest:: - >>> from warnings import resetwarnings + >>> from warnings import resetwarnings, filterwarnings >>> resetwarnings() + >>> filterwarnings('ignore', category=Warning) Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_poly` provides information on the number and type of operations -being performed in a kernel. To demonstrate this, we'll create an example kernel -that performs several operations on arrays containing different types of data: +:func:`loopy.get_op_map` provides information on the characteristics and +quantity of arithmetic operations being performed in a kernel. To demonstrate +this, we'll create an example kernel that performs several operations on arrays +containing different types of data: .. doctest:: @@ -1250,38 +1441,42 @@ information provided. Now we will count the operations: .. doctest:: - >>> from loopy.statistics import get_op_poly - >>> op_map = get_op_poly(knl) + >>> op_map = lp.get_op_map(knl) + >>> print(lp.stringify_stats_mapping(op_map)) + Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + <BLANKLINE> -:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** -:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The -:class:`islpy.PwQPolynomial` holds the number of operations for the type specified -in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this -map now: +:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A +:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an +arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the +number of operations matching the characteristics of the :class:`loopy.Op` +specified in the key (in terms of the :class:`loopy.LoopKernel` +*inames*). :class:`loopy.Op` attributes include: -.. doctest:: +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. - >>> print(lp.stringify_stats_mapping(op_map)) - (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - <BLANKLINE> +- name: A :class:`str` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest::

     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
-    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
-    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
-    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
-    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
-    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
-    >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
+    >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
+    >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...       (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
     524288
@@ -1290,174 +1485,238 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
     65536
     65536

-Counting array accesses
-~~~~~~~~~~~~~~~~~~~~~~~
+:class:`loopy.ToCountMap` provides member functions that facilitate filtering,
+grouping, and evaluating subsets of the counts. Suppose we want to know the
+total number of 32-bit floating point operations. We can easily count these
+using functions :func:`loopy.ToCountMap.filter_by` and
+:func:`loopy.ToCountMap.eval_and_sum`:
+
+.. doctest::
+
+    >>> filtered_op_map = op_map.filter_by(dtype=[np.float32])
+    >>> f32op_count = filtered_op_map.eval_and_sum(param_dict)
+    >>> print(f32op_count)
+    1572864

-:func:`loopy.get_gmem_access_poly` provides information on the number and type of
-array loads and stores being performed in a kernel. To demonstrate this, we'll
-continue using the kernel from the previous example:
+We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`,
+which produces a :class:`loopy.ToCountMap` that contains the same counts grouped
+together into keys containing only the specified fields:

 .. doctest::

-    >>> from loopy.statistics import get_gmem_access_poly
-    >>> load_store_map = get_gmem_access_poly(knl)
-    >>> print(lp.stringify_stats_mapping(load_store_map))
-    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    >>> op_map_dtype = op_map.group_by('dtype')
+    >>> print(lp.stringify_stats_mapping(op_map_dtype))
+    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
+    >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
+    ...                           ].eval_with_dict(param_dict)
+    >>> print(f32op_count)
+    1572864

-:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
-:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
-**:** :class:`islpy.PwQPolynomial` **}**.
+See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
+more information on these functions.
+
+Counting memory accesses
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`loopy.get_mem_access_map` provides information on the number and
+characteristics of memory accesses performed in a kernel. To demonstrate this,
+we'll continue using the kernel from the previous example:
+
+.. doctest::
+
+    >>> mem_map = lp.get_mem_access_map(knl)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>

-- The :class:`numpy.dtype` specifies the type of the data being accessed.
+:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+:class:`loopy.MemAccess` attributes include:

-- The first string in the map key specifies the DRAM access type as *consecutive*,
-  *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when
-  consecutive threads access consecutive array elements in memory, *nonconsecutive*
-  accesses occur when consecutive threads access nonconsecutive array elements in
-  memory, and *uniform* accesses occur when consecutive threads access the *same*
-  element in memory.
+- mtype: A :class:`str` that specifies the memory type accessed as **global**
+  or **local**.

-- The second string in the map key specifies the DRAM access type as a *load*, or a
-  *store*.
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type accessed.

-- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the
-  characteristics specified in the key (in terms of the :class:`loopy.LoopKernel`
-  *inames*).
+- stride: An :class:`int` that specifies the stride of the memory access. A
+  stride of 0 indicates a uniform access (i.e. all threads access the same item).
+
+- direction: A :class:`str` that specifies the direction of memory access as
+  **load** or **store**.
+
+- variable: A :class:`str` that specifies the variable name of the data
+  accessed.

 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:

 .. doctest::

-    >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
-    ...                        ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
-    ...                        ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
-    ...                        ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
-    ...                        ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...       (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
+    ...                   ].eval_with_dict(param_dict)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
+    ...                   ].eval_with_dict(param_dict)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
+    ...                   ].eval_with_dict(param_dict)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
+    ...                   ].eval_with_dict(param_dict)
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
+
+:class:`loopy.ToCountMap` also makes it easy to determine the total amount
+of data moved in bytes. Suppose we want to know the total amount of global
+memory data loaded and stored. We can produce a map with just this information
+using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
+
+.. doctest::
+
+    >>> bytes_map = mem_map.to_bytes()
+    >>> print(lp.stringify_stats_mapping(bytes_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
+    ...                                         ).group_by('direction')
+    >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
+    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
+    ...                            ].eval_with_dict(param_dict)
+    >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
+    ...                            ].eval_with_dict(param_dict)
+    >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
+    bytes loaded: 7340032
+    bytes stored: 2621440
+
+One can see how these functions might be useful in computing, for example,
+achieved memory bandwidth in byte/sec or performance in FLOP/sec.

 ~~~~~~~~~~~

-Since we have not tagged any of the inames or parallelized the kernel across threads
-(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
-the array accesses *uniform*. Now we'll parallelize the kernel and count the array
-accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
-this time, so we'll print the mapping manually to make it more legible:
+Since we have not tagged any of the inames or parallelized the kernel across
+threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
+considers the memory accesses *uniform*, so the *stride* of each access is 0.
+Now we'll parallelize the kernel and count the array accesses again. The
+resulting :class:`islpy.PwQPolynomial` will be more complicated this time.

 ..
doctest:: - >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_gmem_access_poly(knl_consec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'consecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float32'), 'consecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float64'), 'consecutive', 'load') : - [n, m, l] -> { ... } + >>> knl_consec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.1", inner_tag="l.0") + >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... } <BLANKLINE> - (dtype('float64'), 'consecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - With this parallelization, consecutive threads will access consecutive array elements in memory. The polynomials are a bit more complicated now due to the -parallelization, but when we evaluate them, we see that the total number of array -accesses has not changed: - -.. doctest:: - - >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store") - ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store") - ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 +parallelization, but when we evaluate them, we see that the total number of +array accesses has not changed: + +.. doctest:: + + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') + ... ].eval_with_dict(param_dict) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') + ... ].eval_with_dict(param_dict) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') + ... ].eval_with_dict(param_dict) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') + ... ].eval_with_dict(param_dict) + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in -our parallelization of the kernel: +To produce *nonconsecutive* array accesses with stride greater than 1, we'll +switch the inner and outer tags in our parallelization of the kernel: .. doctest:: - >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_gmem_access_poly(knl_nonconsec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... 
print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float32'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float64'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - <BLANKLINE> - (dtype('float64'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } + >>> knl_nonconsec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.0", inner_tag="l.1") + >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... } <BLANKLINE> +With this parallelization, consecutive threads will access *nonconsecutive* +array elements in memory. The total number of array accesses still has not +changed: + +.. doctest:: -With this parallelization, consecutive threads will access *nonconsecutive* array -elements in memory. The total number of array accesses has not changed: + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') + ... ].eval_with_dict(param_dict) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') + ... ].eval_with_dict(param_dict) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') + ... ].eval_with_dict(param_dict) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') + ... ].eval_with_dict(param_dict) + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 + +We can also filter using an arbitrary test function using +:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter +criteria are more complicated than a simple list of allowable values: .. doctest:: - >>> f64ld = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "store") - ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "load") - ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "store") - ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> def f(key): + ... from loopy.types import to_loopy_type + ... return key.dtype == to_loopy_type(np.float32) and \ + ... key.stride > 1 + >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict) + >>> print(count) + 2097152 Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_synchronization_poly` counts the number of synchronization +:func:`loopy.get_synchronization_map` counts the number of synchronization events per **thread** in a kernel. First, we'll call this function on the kernel from the previous example: .. 
doctest:: - >>> from loopy.statistics import get_synchronization_poly - >>> barrier_poly = get_synchronization_poly(knl) - >>> print(lp.stringify_stats_mapping(barrier_poly)) + >>> sync_map = lp.get_synchronization_map(knl) + >>> print(lp.stringify_stats_mapping(sync_map)) kernel_launch : { 1 } <BLANKLINE> @@ -1465,7 +1724,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. doctest:: - >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1495,36 +1754,38 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; <BLANKLINE> - int const k_outer = 0; + { + int const k_outer = 0; <BLANKLINE> - for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } + for (int j = 0; j <= 9; ++j) + for (int i = 0; i <= 49; ++i) + { + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } + } } - -In this kernel, when a thread performs the second instruction it uses data produced -by *different* threads during the first instruction. Because of this, barriers are -required for correct execution, so loopy inserts them. Now we'll count the barriers -using :func:`loopy.get_barrier_poly`: +In this kernel, when a thread performs the second instruction it uses data +produced by *different* threads during the first instruction. Because of this, +barriers are required for correct execution, so loopy inserts them. Now we'll +count the barriers using :func:`loopy.get_synchronization_map`: .. doctest:: - >>> sync_map = lp.get_synchronization_poly(knl) + >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) barrier_local : { 1000 } kernel_launch : { 1 } <BLANKLINE> -Based on the kernel code printed above, we would expect each thread to encounter -50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In -this case, the number of barriers does not depend on any inames, so we can pass an -empty dictionary to :func:`islpy.eval_with_dict`. +Based on the kernel code printed above, we would expect each thread to +encounter 50x10x2 barriers, which matches the result from +:func:`loopy.get_synchronization_map`. In this case, the number of barriers +does not depend on any inames, so we can pass an empty dictionary to +:func:`islpy.eval_with_dict`. .. 
}}} diff --git a/loopy/__init__.py b/loopy/__init__.py index 21a41b11c9b84d288aa2cbb5146db23538613688..6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -106,16 +106,17 @@ from loopy.transform.padding import ( from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters +from loopy.transform.save import save_and_reload_temporaries # }}} -from loopy.preprocess import (preprocess_kernel, realize_reduction, - infer_unknown_types) +from loopy.type_inference import infer_unknown_types +from loopy.preprocess import preprocess_kernel, realize_reduction from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, - get_gmem_access_poly, - get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping, - sum_mem_access_to_bytes, +from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op, + MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, + get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, + get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -130,7 +131,7 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, parse_fortran) from loopy.target import TargetBase, ASTBuilderBase -from loopy.target.c import CTarget +from loopy.target.c import CTarget, generate_header from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -206,6 +207,8 @@ __all__ = [ "assume", "fix_parameters", + "save_and_reload_temporaries", + # }}} "get_dot_dependency_graph", @@ -213,16 +216,18 @@ __all__ = [ "add_dtypes", "add_and_infer_dtypes", - "preprocess_kernel", "realize_reduction", "infer_unknown_types", + "infer_unknown_types", + + "preprocess_kernel", "realize_reduction", "generate_loop_schedules", "get_one_scheduled_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", - "get_DRAM_access_poly", - "get_synchronization_poly", "stringify_stats_mapping", - "sum_mem_access_to_bytes", + "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess", + "get_op_poly", "get_op_map", "get_lmem_access_poly", + "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", + "get_synchronization_poly", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", @@ -236,7 +241,9 @@ __all__ = [ "LoopyError", "LoopyWarning", - "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget", + "TargetBase", + "CTarget", "generate_header", + "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", @@ -254,7 +261,6 @@ __all__ = [ # }}} ] - # }}} @@ -274,6 +280,9 @@ def set_options(kernel, *args, **kwargs): new_opt = kernel.options.copy() if kwargs: + from loopy.options import _apply_legacy_map, Options + kwargs = _apply_legacy_map(Options._legacy_options_map, kwargs) + for key, val in six.iteritems(kwargs): if not hasattr(new_opt, key): raise ValueError("unknown option '%s'" % key) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 479b898be610f6c9694be14f2095764ff14b767c..6a4d559758bc1d7ca52e9dc4da1b7e503e22cc29 100644 --- a/loopy/auto_test.py +++ 
b/loopy/auto_test.py @@ -109,7 +109,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_arg_data.append(None) elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg \ - or arg.arg_class is ConstantArg: + or arg.arg_class is ConstantArg: if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " "testing" % arg.name) @@ -422,7 +422,7 @@ def auto_test_vs_ref( # {{{ compile and run reference code - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types ref_knl = infer_unknown_types(ref_knl, expect_completion=True) found_ref_device = False @@ -530,7 +530,7 @@ def auto_test_vs_ref( test_kernel_count = 0 - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types for i, kernel in enumerate(test_kernels): test_kernel_count += 1 if test_kernel_count > max_test_kernel_count: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 79d824a44fc04f479139f2797994621f09798297..6f312ec798e13fa4b1d183c27578089857b13e3d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -25,7 +25,7 @@ THE SOFTWARE. import six from loopy.diagnostic import LoopyError, warn -from pytools import Record +from pytools import ImmutableRecord import islpy as isl from pytools.persistent_dict import PersistentDict @@ -38,7 +38,7 @@ logger = logging.getLogger(__name__) # {{{ implemented data info -class ImplementedDataInfo(Record): +class ImplementedDataInfo(ImmutableRecord): """ .. attribute:: name @@ -91,7 +91,7 @@ class ImplementedDataInfo(Record): from loopy.types import LoopyType assert isinstance(dtype, LoopyType) - Record.__init__(self, + ImmutableRecord.__init__(self, name=name, dtype=dtype, arg_class=arg_class, @@ -127,7 +127,7 @@ class VectorizationInfo(object): self.space = space -class SeenFunction(Record): +class SeenFunction(ImmutableRecord): """ .. attribute:: name .. attribute:: c_name @@ -137,15 +137,11 @@ class SeenFunction(Record): """ def __init__(self, name, c_name, arg_dtypes): - Record.__init__(self, + ImmutableRecord.__init__(self, name=name, c_name=c_name, arg_dtypes=arg_dtypes) - def __hash__(self): - return hash((type(self),) - + tuple((f, getattr(self, f)) for f in type(self).fields)) - class CodeGenerationState(object): """ @@ -365,7 +361,7 @@ code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -class PreambleInfo(Record): +class PreambleInfo(ImmutableRecord): """ .. attribute:: kernel .. 
attribute:: seen_dtypes @@ -409,7 +405,7 @@ def generate_code_v2(kernel): # }}} - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) from loopy.check import pre_codegen_checks diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index fb254bd54480f716de54de96f6aab9a4bb427767..7cc381f11d1239cba5656a9dc7a04cddaa14a368 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -27,30 +27,24 @@ import islpy as isl from islpy import dim_type -# {{{ bounds check generator +# {{{ approximate, convex bounds check generator -def get_bounds_checks(domain, check_inames, implemented_domain, - overapproximate): +def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domain): if isinstance(domain, isl.BasicSet): domain = isl.Set.from_basic_set(domain) domain = domain.remove_redundancies() result = domain.eliminate_except(check_inames, [dim_type.set]) - if overapproximate: - # This is ok, because we're really looking for the - # projection, with no remaining constraints from - # the eliminated variables. - result = result.remove_divs() - else: - result = result.compute_divs() + # This is ok, because we're really looking for the + # projection, with no remaining constraints from + # the eliminated variables. + result = result.remove_divs() result, implemented_domain = isl.align_two(result, implemented_domain) result = result.gist(implemented_domain) - if overapproximate: - result = result.remove_divs() - else: - result = result.compute_divs() + # (see above) + result = result.remove_divs() from loopy.isl_helpers import convexify result = convexify(result) @@ -62,23 +56,33 @@ def get_bounds_checks(domain, check_inames, implemented_domain, # {{{ on which inames may a conditional depend? def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import EnterLoop, LeaveLoop + from loopy.schedule import ( + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag - result = set() + result = find_active_inames_at(kernel, sched_index) + crosses_barrier = has_barrier_within(kernel, sched_index) - for i, sched_item in enumerate(kernel.schedule): - if i >= sched_index: - break - if isinstance(sched_item, EnterLoop): - result.add(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - result.remove(sched_item.iname) + # Find our containing subkernel, grab inames for all insns from there. - for iname in kernel.all_inames(): + subkernel_index = sched_index + from loopy.schedule import CallKernel + + while not isinstance(kernel.schedule[subkernel_index], CallKernel): + subkernel_index -= 1 + + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subkernel_index) + + inames_for_subkernel = ( + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)) + + for iname in inames_for_subkernel: tag = kernel.iname_to_tag.get(iname) - # Parallel inames are always defined, BUT: + # Parallel inames are defined within a subkernel, BUT: # # - local indices may not be used in conditionals that cross barriers. 
# @@ -87,7 +91,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): if ( isinstance(tag, ParallelTag) - and not isinstance(tag, LocalIndexTagBase) + and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): result.add(iname) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 3378ed81ee56f97cc11f8f8998aeb67221061633..d206faad5bd84e3a1c7e7c061673f3d5d1144c84 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -150,8 +150,15 @@ def generate_code_for_sched_index(codegen_state, sched_index): return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): - return codegen_state.ast_builder.emit_barrier( - sched_item.kind, sched_item.comment) + if codegen_state.is_generating_device_code: + return codegen_state.ast_builder.emit_barrier( + sched_item.kind, sched_item.comment) + from loopy.codegen.result import CodeGenerationResult + return CodeGenerationResult( + host_program=None, + device_programs=[], + implemented_domains={}, + implemented_data_info=codegen_state.implemented_data_info) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -248,9 +255,9 @@ def build_loop_nest(codegen_state, schedule_index): # {{{ pass 2: find admissible conditional inames for each sibling schedule item - from pytools import Record + from pytools import ImmutableRecord - class ScheduleIndexInfo(Record): + class ScheduleIndexInfo(ImmutableRecord): """ .. attribute:: schedule_index .. attribute:: admissible_cond_inames @@ -301,13 +308,11 @@ def build_loop_nest(codegen_state, schedule_index): domain = isl.align_spaces( self.kernel.get_inames_domain(check_inames), self.impl_domain, obj_bigger_ok=True) - from loopy.codegen.bounds import get_bounds_checks - return get_bounds_checks(domain, - check_inames, self.impl_domain, - - # Each instruction individually gets its bounds checks, - # so we can safely overapproximate here. - overapproximate=True) + from loopy.codegen.bounds import get_approximate_convex_bounds_checks + # Each instruction individually gets its bounds checks, + # so we can safely overapproximate here. + return get_approximate_convex_bounds_checks(domain, + check_inames, self.impl_domain) def build_insn_group(sched_index_info_entries, codegen_state, done_group_lengths=set()): @@ -318,6 +323,8 @@ def build_loop_nest(codegen_state, schedule_index): recursive calls from doing anything about groups that are too small. """ + from loopy.symbolic import get_dependencies + # The rough plan here is that build_insn_group starts out with the # entirety of the current schedule item's downward siblings (i.e. all # the ones up to the next LeaveLoop). It will then iterate upward to @@ -365,6 +372,11 @@ def build_loop_nest(codegen_state, schedule_index): & sched_index_info_entries[candidate_group_length-1] .required_predicates) + current_pred_set = frozenset( + pred for pred in current_pred_set + if get_dependencies(pred) & kernel.all_inames() + <= current_iname_set) + # {{{ see which inames are actually used in group # And only generate conditionals for those. 
@@ -451,13 +463,13 @@ def build_loop_nest(codegen_state, schedule_index): # gen_code returns a list if bounds_checks or pred_checks: - from loopy.symbolic import constraint_to_expr + from loopy.symbolic import constraint_to_cond_expr prev_gen_code = gen_code def gen_code(inner_codegen_state): condition_exprs = [ - constraint_to_expr(cns) + constraint_to_cond_expr(cns) for cns in bounds_checks] + [ pred_chk for pred_chk in pred_checks] diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 140ec644731d570fac2e793f0c4e5ea004d165e6..c490abb6ed1635c135fc77468f27cd833b1d57b2 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -27,6 +27,7 @@ THE SOFTWARE. from six.moves import range import islpy as isl +dim_type = isl.dim_type from loopy.codegen import Unvectorizable from loopy.codegen.result import CodeGenerationResult from pymbolic.mapper.stringifier import PREC_NONE @@ -34,24 +35,27 @@ from pymbolic.mapper.stringifier import PREC_NONE def to_codegen_result( codegen_state, insn_id, domain, check_inames, required_preds, ast): - from loopy.codegen.bounds import get_bounds_checks - from loopy.symbolic import constraint_to_expr - - bounds_checks = get_bounds_checks( - domain, check_inames, - codegen_state.implemented_domain, overapproximate=False) - bounds_check_set = isl.Set.universe(domain.get_space()) \ - .add_constraints(bounds_checks) - bounds_check_set, new_implemented_domain = isl.align_two( - bounds_check_set, codegen_state.implemented_domain) - new_implemented_domain = new_implemented_domain & bounds_check_set - - if bounds_check_set.is_empty(): + # {{{ get bounds check + + chk_domain = isl.Set.from_basic_set(domain) + chk_domain = chk_domain.remove_redundancies() + chk_domain = chk_domain.eliminate_except(check_inames, [dim_type.set]) + + chk_domain, implemented_domain = isl.align_two( + chk_domain, codegen_state.implemented_domain) + chk_domain = chk_domain.gist(implemented_domain) + + # }}} + + new_implemented_domain = implemented_domain & chk_domain + + if chk_domain.is_empty(): return None - condition_exprs = [ - constraint_to_expr(cns) - for cns in bounds_checks] + condition_exprs = [] + if not chk_domain.plain_is_universe(): + from loopy.symbolic import set_to_cond_expr + condition_exprs.append(set_to_cond_expr(chk_domain)) condition_exprs.extend( required_preds - codegen_state.implemented_predicates) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 648c3fe6f5b748dcc47de5ac972bb82ce605a9a9..8ac963835ec12702f2010806d1d49062422318a2 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -386,48 +386,39 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - from loopy.isl_helpers import ( - static_min_of_pw_aff, - static_max_of_pw_aff) - lbound = ( kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) ubound = ( kernel.cache_manager.dim_max( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) - static_lbound = static_min_of_pw_aff( - lbound, - constants_only=False) - static_ubound = static_max_of_pw_aff( - ubound, - constants_only=False) - # }}} - # {{{ find implemented slab, build inner code + # {{{ find implemented loop, build inner code - from loopy.isl_helpers import make_slab_from_bound_pwaffs + from loopy.isl_helpers import make_loop_bounds_from_pwaffs - # impl_slab may be overapproximated - impl_slab = 
make_slab_from_bound_pwaffs( + # impl_loop may be overapproximated + impl_loop = make_loop_bounds_from_pwaffs( dom_and_slab.space, - loop_iname, static_lbound, static_ubound) + loop_iname, lbound, ubound) for iname in moved_inames: - dt, idx = impl_slab.get_var_dict()[iname] - impl_slab = impl_slab.move_dims( - dim_type.set, impl_slab.dim(dim_type.set), + dt, idx = impl_loop.get_var_dict()[iname] + impl_loop = impl_loop.move_dims( + dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1) new_codegen_state = ( codegen_state - .intersect(impl_slab) + .intersect(impl_loop) .copy(kernel=intersect_kernel_with_slab( kernel, slab, iname))) @@ -438,21 +429,30 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): if cmt is not None: result.append(codegen_state.ast_builder.emit_comment(cmt)) - from loopy.symbolic import aff_to_expr - astb = codegen_state.ast_builder - if (static_ubound - static_lbound).plain_is_zero(): + zero = isl.PwAff.zero_on_domain( + isl.LocalSpace.from_space( + lbound.get_space()).domain()) + + from loopy.symbolic import pw_aff_to_expr + + if (ubound - lbound).plain_is_equal(zero): # single-trip, generate just a variable assignment, not a loop - result.append(merge_codegen_results(codegen_state, [ + inner = merge_codegen_results(codegen_state, [ astb.emit_initializer( codegen_state, kernel.index_dtype, loop_iname, - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"), is_const=True), astb.emit_blank_line(), inner, - ])) + ]) + result.append( + inner.with_new_ast( + codegen_state, + astb.ast_block_scope_class( + inner.current_ast(codegen_state)))) else: inner_ast = inner.current_ast(codegen_state) @@ -461,7 +461,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state, astb.emit_sequential_loop( codegen_state, loop_iname, kernel.index_dtype, - static_lbound, static_ubound, inner_ast))) + pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 04fab05afdc38a8843a566e0e6e6b10098d6415c..4318ad71c1b16deeaac98f8408d5ca82f2de1714 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -23,7 +23,7 @@ THE SOFTWARE. """ import six -from pytools import Record +from pytools import ImmutableRecord def process_preambles(preambles): @@ -45,7 +45,7 @@ def process_preambles(preambles): # {{{ code generation result -class GeneratedProgram(Record): +class GeneratedProgram(ImmutableRecord): """ .. attribute:: name @@ -64,7 +64,7 @@ class GeneratedProgram(Record): """ -class CodeGenerationResult(Record): +class CodeGenerationResult(ImmutableRecord): """ .. attribute:: host_program .. 
attribute:: device_programs @@ -207,6 +207,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): codegen_result = None block_cls = codegen_state.ast_builder.ast_block_class + block_scope_cls = codegen_state.ast_builder.ast_block_scope_class for el in elements: if isinstance(el, CodeGenerationResult): @@ -227,7 +228,8 @@ def merge_codegen_results(codegen_state, elements, collapse=True): dev_program_names.add(dp.name) cur_ast = el.current_ast(codegen_state) - if isinstance(cur_ast, block_cls): + if (isinstance(cur_ast, block_cls) + and not isinstance(cur_ast, block_scope_cls)): ast_els.extend(cur_ast.contents) else: ast_els.append(cur_ast) diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 89600102e09bb96173bb11db5c71d14dd3b2a206..29996d6c78b6fd99e52a750968291d0dd3d7c941 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -89,9 +89,7 @@ class StaticValueFindingError(LoopyError): class DependencyTypeInferenceFailure(TypeInferenceFailure): - def __init__(self, message, symbol): - TypeInferenceFailure.__init__(self, message) - self.symbol = symbol + pass class MissingBarrierError(LoopyError): diff --git a/loopy/execution.py b/loopy/execution.py index 802684247f9f95a4374838ddcfaaae0ddbadec2e..5680fdbfef614a0df1674a56842acd1869d14636 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -25,13 +25,13 @@ THE SOFTWARE. import six import numpy as np -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError # {{{ object array argument packing -class _PackingInfo(Record): +class _PackingInfo(ImmutableRecord): """ .. attribute:: name .. attribute:: sep_shape @@ -160,7 +160,7 @@ class KernelExecutorBase(object): kernel = add_dtypes(kernel, var_to_dtype) - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) if kernel.schedule is None: diff --git a/loopy/expression.py b/loopy/expression.py index 991f4a93e30a76a09b527e4fd326cfafff5e7569..3269bc09f064f57857eaa5218c8370383e0f735e 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -25,14 +25,10 @@ THE SOFTWARE. import numpy as np -from pymbolic.mapper import CombineMapper, RecursiveMapper +from pymbolic.mapper import RecursiveMapper -from loopy.tools import is_integer -from loopy.types import NumpyType from loopy.codegen import Unvectorizable -from loopy.diagnostic import ( - LoopyError, - TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.diagnostic import LoopyError # type_context may be: @@ -57,264 +53,6 @@ def dtype_to_type_context(target, dtype): return None -# {{{ type inference - -class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): - """ - :arg new_assignments: mapping from names to either - :class:`loopy.kernel.data.TemporaryVariable` - or - :class:`loopy.kernel.data.KernelArgument` - instances - """ - self.kernel = kernel - if new_assignments is None: - new_assignments = {} - self.new_assignments = new_assignments - - # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) - # are Python-equal (for many common constants such as integers). 
- - def with_assignments(self, names_to_vars): - new_ass = self.new_assignments.copy() - new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) - - @staticmethod - def combine(dtypes): - # dtypes may just be a generator expr - dtypes = list(dtypes) - - from loopy.types import LoopyType, NumpyType - assert all(isinstance(dtype, LoopyType) for dtype in dtypes) - - if not all(isinstance(dtype, NumpyType) for dtype in dtypes): - from pytools import is_single_valued, single_valued - if not is_single_valued(dtypes): - raise TypeInferenceFailure( - "Nothing known about operations between '%s'" - % ", ".join(str(dt) for dt in dtypes)) - - return single_valued(dtypes) - - dtypes = [dtype.dtype for dtype in dtypes] - - result = dtypes.pop() - while dtypes: - other = dtypes.pop() - - if result.fields is None and other.fields is None: - if (result, other) in [ - (np.int32, np.float32), (np.float32, np.int32)]: - # numpy makes this a double. I disagree. - result = np.dtype(np.float32) - else: - result = ( - np.empty(0, dtype=result) - + np.empty(0, dtype=other) - ).dtype - - elif result.fields is None and other.fields is not None: - # assume the non-native type takes over - # (This is used for vector types.) - result = other - elif result.fields is not None and other.fields is None: - # assume the non-native type takes over - # (This is used for vector types.) - pass - else: - if result is not other: - raise TypeInferenceFailure( - "nothing known about result of operation on " - "'%s' and '%s'" % (result, other)) - - return NumpyType(result) - - def map_sum(self, expr): - dtypes = [] - small_integer_dtypes = [] - for child in expr.children: - dtype = self.rec(child) - if is_integer(child) and abs(child) < 1024: - small_integer_dtypes.append(dtype) - else: - dtypes.append(dtype) - - from pytools import all - if all(dtype.is_integral() for dtype in dtypes): - dtypes.extend(small_integer_dtypes) - - return self.combine(dtypes) - - map_product = map_sum - - def map_quotient(self, expr): - n_dtype = self.rec(expr.numerator) - d_dtype = self.rec(expr.denominator) - - if n_dtype.is_integral() and d_dtype.is_integral(): - # both integers - return NumpyType(np.dtype(np.float64)) - - else: - return self.combine([n_dtype, d_dtype]) - - def map_constant(self, expr): - if is_integer(expr): - for tp in [np.int32, np.int64]: - iinfo = np.iinfo(tp) - if iinfo.min <= expr <= iinfo.max: - return NumpyType(np.dtype(tp)) - - else: - raise TypeInferenceFailure("integer constant '%s' too large" % expr) - - dt = np.asarray(expr).dtype - if hasattr(expr, "dtype"): - return NumpyType(expr.dtype) - elif isinstance(expr, np.number): - # Numpy types are sized - return NumpyType(np.dtype(type(expr))) - elif dt.kind == "f": - # deduce the smaller type by default - return NumpyType(np.dtype(np.float32)) - elif dt.kind == "c": - if np.complex64(expr) == np.complex128(expr): - # (COMPLEX_GUESS_LOGIC) - # No precision is lost by 'guessing' single precision, use that. - # This at least covers simple cases like '1j'. - return NumpyType(np.dtype(np.complex64)) - - # Codegen for complex types depends on exactly correct types. - # Refuse temptation to guess. 
- raise TypeInferenceFailure("Complex constant '%s' needs to " - "be sized for type inference " % expr) - else: - raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) - - def map_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_linear_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_call(self, expr, multiple_types_ok=False): - from pymbolic.primitives import Variable - - identifier = expr.function - if isinstance(identifier, Variable): - identifier = identifier.name - - if identifier in ["indexof", "indexof_vec"]: - return self.kernel.index_dtype - - arg_dtypes = tuple(self.rec(par) for par in expr.parameters) - - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if multiple_types_ok: - if mangle_result is not None: - return mangle_result.result_dtypes - else: - if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not multiple_types_ok: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") - - return mangle_result.result_dtypes[0] - - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) - - def map_variable(self, expr): - if expr.name in self.kernel.all_inames(): - return self.kernel.index_dtype - - result = self.kernel.mangle_symbol( - self.kernel.target.get_device_ast_builder(), - expr.name) - - if result is not None: - result_dtype, _ = result - return result_dtype - - obj = self.new_assignments.get(expr.name) - - if obj is None: - obj = self.kernel.arg_dict.get(expr.name) - - if obj is None: - obj = self.kernel.temporary_variables.get(expr.name) - - if obj is None: - raise TypeInferenceFailure("name not known in type inference: %s" - % expr.name) - - from loopy.kernel.data import TemporaryVariable, KernelArgument - import loopy as lp - if isinstance(obj, TemporaryVariable): - result = obj.dtype - if result is lp.auto: - raise DependencyTypeInferenceFailure( - "temporary variable '%s'" % expr.name, - expr.name) - else: - return result - - elif isinstance(obj, KernelArgument): - result = obj.dtype - if result is None: - raise DependencyTypeInferenceFailure( - "argument '%s'" % expr.name, - expr.name) - else: - return result - - else: - raise RuntimeError("unexpected type inference " - "object type for '%s'" % expr.name) - - map_tagged_variable = map_variable - - def map_lookup(self, expr): - agg_result = self.rec(expr.aggregate) - field = agg_result.numpy_dtype.fields[expr.name] - dtype = field[0] - return NumpyType(dtype) - - def map_comparison(self, expr): - # "bool" is unusable because OpenCL's bool has indeterminate memory - # format. 
- return NumpyType(np.dtype(np.int32)) - - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison - - def map_group_hw_index(self, expr, *args): - return self.kernel.index_dtype - - def map_local_hw_index(self, expr, *args): - return self.kernel.index_dtype - - def map_reduction(self, expr, multiple_types_ok=False): - result = expr.operation.result_dtypes( - self.kernel, self.rec(expr.expr), expr.inames) - - if multiple_types_ok: - return result - - else: - if len(result) != 1 and not multiple_types_ok: - raise LoopyError("reductions with more or fewer than one " - "return value may only be used in direct assignments") - - return result[0] - -# }}} - - # {{{ vetorizability checker class VectorizabilityChecker(RecursiveMapper): diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index e657beecbc5453ae5b2390da5a958d2fc9a70771..602830de38e457c5ff4a55d7685dc346a7b4de35 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -102,7 +102,7 @@ def make_slab(space, iname, start, stop): return result -def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): +def make_loop_bounds_from_pwaffs(space, iname, lbound, ubound): dt, pos = space.get_var_dict()[iname] iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos) @@ -111,10 +111,10 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): assert iname_pwaff.space == lbound.space assert iname_pwaff.space == ubound.space - return convexify( - iname_pwaff.ge_set(lbound) - & - iname_pwaff.le_set(ubound)) + return ( + iname_pwaff.ge_set(lbound) + & + iname_pwaff.le_set(ubound)) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d618d4b0dd9adc0ccb50827dce914571538a62a4..5b192934c8f56ccf364d1b9ba58d81fa8c28ff63 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -28,7 +28,7 @@ import six from six.moves import range, zip, intern import numpy as np -from pytools import RecordWithoutPickling, Record, memoize_method +from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method import islpy as isl from islpy import dim_type import re @@ -83,7 +83,7 @@ class kernel_state: # noqa SCHEDULED = 2 -class LoopKernel(RecordWithoutPickling): +class LoopKernel(ImmutableRecordWithoutPickling): """These correspond more or less directly to arguments of :func:`loopy.make_kernel`. 
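# A minimal sketch (not part of the patch) of what the renamed
# make_loop_bounds_from_pwaffs helper in the loopy/isl_helpers.py hunk above
# computes. The iname "i" and the bounds are made up; only islpy operations
# that the helper itself uses appear here.

import islpy as isl

lbound = isl.PwAff("[n] -> { [i] -> [0] }")
ubound = isl.PwAff("[n] -> { [i] -> [n - 1] }")
i_pw = isl.PwAff("[n] -> { [i] -> [i] }")

# lbound <= i and i <= ubound; the result is no longer run through
# convexify(), so it may stay non-convex for multi-piece bounds.
print(i_pw.ge_set(lbound) & i_pw.le_set(ubound))
# e.g. [n] -> { [i] : 0 <= i < n }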
@@ -259,7 +259,7 @@ class LoopKernel(RecordWithoutPickling): # }}} from loopy.types import to_loopy_type - index_dtype = to_loopy_type(index_dtype).with_target(target) + index_dtype = to_loopy_type(index_dtype, target=target) if not index_dtype.is_integral(): raise TypeError("index_dtype must be an integer") if np.iinfo(index_dtype.numpy_dtype).min >= 0: @@ -279,7 +279,7 @@ class LoopKernel(RecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT - RecordWithoutPickling.__init__(self, + ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, args=args, @@ -889,7 +889,7 @@ class LoopKernel(RecordWithoutPickling): dom_intersect_assumptions, iname_idx) .coalesce()) - class BoundsRecord(Record): + class BoundsRecord(ImmutableRecord): pass size = (upper_bound_pw_aff - lower_bound_pw_aff + 1) @@ -1056,6 +1056,19 @@ class LoopKernel(RecordWithoutPickling): # }}} + # {{{ nosync sets + + @memoize_method + def get_nosync_set(self, insn_id, scope): + assert scope in ("local", "global") + + return frozenset( + insn_id + for insn_id, nosync_scope in self.id_to_insn[insn_id].no_sync_with + if nosync_scope == scope or nosync_scope == "any") + + # }}} + # {{{ pretty-printing def stringify(self, what=None, with_dependencies=False): @@ -1213,7 +1226,9 @@ class LoopKernel(RecordWithoutPickling): options.append( "conflicts=%s" % ":".join(insn.conflicts_with_groups)) if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join(insn.no_sync_with)) + # FIXME: Find a syntax to express scopes. + options.append("no_sync_with=%s" % ":".join(id for id, _ in + insn.no_sync_with)) if lhs: core = "%s <- %s" % ( diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 99bbc7bf9e782fe7995c20d8d3482602c9874dc9..a02fc58d97f370d45f36a465c38fa3caf3da9d41 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -30,7 +30,7 @@ import six from six.moves import range, zip from six import iteritems -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method import numpy as np # noqa @@ -40,7 +40,7 @@ from loopy.tools import is_integer # {{{ array dimension tags -class ArrayDimImplementationTag(Record): +class ArrayDimImplementationTag(ImmutableRecord): def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. @@ -544,7 +544,7 @@ def _parse_shape_or_strides(x): return tuple(_pymbolic_parse_if_necessary(xi) for xi in x) -class ArrayBase(Record): +class ArrayBase(ImmutableRecord): """ .. attribute :: name @@ -576,6 +576,7 @@ class ArrayBase(Record): def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, for_atomic=False, + target=None, **kwargs): """ All of the following are optional. Specify either strides or shape. 
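# A minimal sketch (untested; the kernel and instruction ids are made up) of
# the new LoopKernel.get_nosync_set helper added above. Entries whose scope
# is "any" are reported for both barrier kinds.

import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    """
    a[i] = 0         {id=init}
    a[i] = a[i] + 1  {id=update, dep=init, nosync=init}
    """)

# "nosync=init" is parsed into the scoped tuple ("init", "any"):
print(knl.id_to_insn["update"].no_sync_with)         # frozenset({('init', 'any')})
print(knl.get_nosync_set("update", scope="local"))   # frozenset({'init'})
print(knl.get_nosync_set("update", scope="global"))  # frozenset({'init'})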
@@ -659,7 +660,7 @@ class ArrayBase(Record): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True, - for_atomic=for_atomic) + for_atomic=for_atomic, target=target) strides_known = strides is not None and strides is not lp.auto shape_known = shape is not None and shape is not lp.auto @@ -786,7 +787,7 @@ class ArrayBase(Record): warn("dim_names is not a tuple when calling ArrayBase constructor", DeprecationWarning, stacklevel=2) - Record.__init__(self, + ImmutableRecord.__init__(self, name=name, dtype=dtype, shape=shape, @@ -1162,7 +1163,7 @@ class ArrayBase(Record): # {{{ access code generation -class AccessInfo(Record): +class AccessInfo(ImmutableRecord): """ .. attribute:: array_name .. attribute:: vector_index diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ff3bf16bcf32b26b1865d350aefbef80ec4e4554..6c5491384d4fc37dc48604aa52753d11ac10fc55 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -149,9 +149,9 @@ def expand_defines_in_expr(expr, defines): def get_default_insn_options_dict(): return { - "depends_on": None, + "depends_on": frozenset(), "depends_on_is_final": False, - "no_sync_with": None, + "no_sync_with": frozenset(), "groups": frozenset(), "conflicts_with_groups": frozenset(), "insn_id": None, @@ -221,18 +221,37 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): result["depends_on_is_final"] = True opt_value = (opt_value[1:]).strip() - result["depends_on"] = frozenset( + result["depends_on"] = result["depends_on"].union(frozenset( intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip()) + if dep.strip())) + + elif opt_key == "dep_query" and opt_value is not None: + from loopy.match import parse_match + match = parse_match(opt_value) + result["depends_on"] = result["depends_on"].union(frozenset([match])) elif opt_key == "nosync" and opt_value is not None: if is_with_block: raise LoopyError("'nosync' option may not be specified " "in a 'with' block") - result["no_sync_with"] = frozenset( - intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip()) + # TODO: Come up with a syntax that allows the user to express + # different synchronization scopes. + result["no_sync_with"] = result["no_sync_with"].union(frozenset( + (intern(dep.strip()), "any") + for dep in opt_value.split(":") if dep.strip())) + + elif opt_key == "nosync_query" and opt_value is not None: + if is_with_block: + raise LoopyError("'nosync_query' option may not be specified " + "in a 'with' block") + + from loopy.match import parse_match + match = parse_match(opt_value) + # TODO: Come up with a syntax that allows the user to express + # different synchronization scopes. 
+ result["no_sync_with"] = result["no_sync_with"].union( + frozenset([(match, "any")])) elif opt_key == "groups" and opt_value is not None: result["groups"] = frozenset( @@ -555,10 +574,16 @@ def parse_instructions(instructions, defines): continue elif isinstance(insn, InstructionBase): + def intern_if_str(s): + if isinstance(s, str): + return intern(s) + else: + return s + new_instructions.append( insn.copy( id=intern(insn.id) if isinstance(insn.id, str) else insn.id, - depends_on=frozenset(intern(dep) for dep in insn.depends_on), + depends_on=frozenset(intern_if_str(dep) for dep in insn.depends_on), groups=frozenset(intern(grp) for grp in insn.groups), conflicts_with_groups=frozenset( intern(grp) for grp in insn.conflicts_with_groups), @@ -1244,7 +1269,8 @@ def create_temporaries(knl, default_order): scope=lp.auto, base_indices=lp.auto, shape=lp.auto, - order=default_order) + order=default_order, + target=knl.target) if isinstance(insn, Assignment): insn = insn.copy(temp_var_type=None) @@ -1412,43 +1438,40 @@ def apply_default_order_to_args(kernel, default_order): # }}} -# {{{ resolve wildcard insn dependencies - -def find_matching_insn_ids(knl, dep): - from fnmatch import fnmatchcase +# {{{ resolve instruction dependencies - return [ - other_insn.id - for other_insn in knl.instructions - if fnmatchcase(other_insn.id, dep)] +def _resolve_dependencies(knl, insn, deps): + from loopy import find_instructions + from loopy.match import MatchExpressionBase - -def resove_wildcard_insn_ids(knl, deps): new_deps = [] - for dep in deps: - matches = find_matching_insn_ids(knl, dep) - if matches: - new_deps.extend(matches) + for dep in deps: + if isinstance(dep, MatchExpressionBase): + for new_dep in find_instructions(knl, dep): + if new_dep.id != insn.id: + new_deps.append(new_dep.id) else: - # Uh, best we can do - new_deps.append(dep) + from fnmatch import fnmatchcase + for other_insn in knl.instructions: + if fnmatchcase(other_insn.id, dep): + new_deps.append(other_insn.id) return frozenset(new_deps) -def resolve_wildcard_deps(knl): +def resolve_dependencies(knl): new_insns = [] for insn in knl.instructions: - if insn.depends_on is not None: - insn = insn.copy( - depends_on=resove_wildcard_insn_ids(knl, insn.depends_on), - no_sync_with=resove_wildcard_insn_ids( - knl, insn.no_sync_with), - ) - - new_insns.append(insn) + new_insns.append(insn.copy( + depends_on=_resolve_dependencies(knl, insn, insn.depends_on), + no_sync_with=frozenset( + (resolved_insn_id, nosync_scope) + for nosync_dep, nosync_scope in insn.no_sync_with + for resolved_insn_id in + _resolve_dependencies(knl, insn, nosync_dep)), + )) return knl.copy(instructions=new_insns) @@ -1785,7 +1808,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): knl = expand_defines_in_shapes(knl, defines) knl = guess_arg_shape_if_requested(knl, default_order) knl = apply_default_order_to_args(knl, default_order) - knl = resolve_wildcard_deps(knl) + knl = resolve_dependencies(knl) knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=False) # ------------------------------------------------------------------------- diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 004fae7f9664ff62c34a994671ea792e4eddc836..61be55ca88b105f2cf58e8aeace09e9c20f54857 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -27,7 +27,7 @@ THE SOFTWARE. 
from six.moves import intern import numpy as np # noqa -from pytools import Record +from pytools import ImmutableRecord from loopy.kernel.array import ArrayBase from loopy.diagnostic import LoopyError from loopy.kernel.instruction import ( # noqa @@ -54,7 +54,7 @@ class auto(object): # noqa # {{{ iname tags -class IndexTag(Record): +class IndexTag(ImmutableRecord): __slots__ = [] def __hash__(self): @@ -93,7 +93,7 @@ class AxisTag(UniqueTag): __slots__ = ["axis"] def __init__(self, axis): - Record.__init__(self, + ImmutableRecord.__init__(self, axis=axis) @property @@ -197,21 +197,24 @@ def parse_tag(tag): # {{{ arguments -class KernelArgument(Record): +class KernelArgument(ImmutableRecord): """Base class for all argument types""" def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) + target = kwargs.pop("target", None) + dtype = kwargs.pop("dtype", None) from loopy.types import to_loopy_type kwargs["dtype"] = to_loopy_type( - dtype, allow_auto=True, allow_none=True) + dtype, allow_auto=True, allow_none=True, target=target) - Record.__init__(self, **kwargs) + ImmutableRecord.__init__(self, **kwargs) class GlobalArg(ArrayBase, KernelArgument): + __doc__ = ArrayBase.__doc__ min_target_axes = 0 max_target_axes = 1 @@ -221,6 +224,7 @@ class GlobalArg(ArrayBase, KernelArgument): class ConstantArg(ArrayBase, KernelArgument): + __doc__ = ArrayBase.__doc__ min_target_axes = 0 max_target_axes = 1 @@ -230,6 +234,7 @@ class ConstantArg(ArrayBase, KernelArgument): class ImageArg(ArrayBase, KernelArgument): + __doc__ = ArrayBase.__doc__ min_target_axes = 1 max_target_axes = 3 @@ -243,11 +248,11 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument): - def __init__(self, name, dtype=None, approximately=1000): - from loopy.types import to_loopy_type + def __init__(self, name, dtype=None, approximately=1000, target=None): KernelArgument.__init__(self, name=name, - dtype=to_loopy_type(dtype, allow_auto=True, allow_none=True), - approximately=approximately) + dtype=dtype, + approximately=approximately, + target=target) def __str__(self): import loopy as lp @@ -509,7 +514,7 @@ class TemporaryVariable(ArrayBase): # {{{ subsitution rule -class SubstitutionRule(Record): +class SubstitutionRule(ImmutableRecord): """ .. attribute:: name .. attribute:: arguments @@ -522,7 +527,7 @@ class SubstitutionRule(Record): def __init__(self, name, arguments, expression): assert isinstance(arguments, tuple) - Record.__init__(self, + ImmutableRecord.__init__(self, name=name, arguments=arguments, expression=expression) def __str__(self): @@ -543,7 +548,7 @@ class SubstitutionRule(Record): # {{{ function call mangling -class CallMangleInfo(Record): +class CallMangleInfo(ImmutableRecord): """ .. attribute:: target_name diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c54d1fc329a3a8797b17458dc40e489044e9374a..93642103e50da5aabfcb7bd86cc50ce6ff903a18 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -23,14 +23,14 @@ THE SOFTWARE. """ from six.moves import intern -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError from warnings import warn # {{{ instructions: base class -class InstructionBase(Record): +class InstructionBase(ImmutableRecord): """A base class for all types of instruction that can occur in a kernel. @@ -51,6 +51,17 @@ class InstructionBase(Record): May be *None* to invoke the default. 
+ There are two extensions to this: + + - You may use `*` as a wildcard in the given IDs. This will be expanded + to all matching instruction IDs during :func:`loopy.make_kernel`. + - Instead of an instruction ID, you may pass an instance of + :class:`loopy.match.MatchExpressionBase` into the :attr:`depends_on` + :class:`frozenset`. The given expression will be used to add any + matching instructions in the kernel to :attr:`depends_on` during + :func:`loopy.make_kernel`. Note that this is not meant as a user-facing + interface. + .. attribute:: depends_on_is_final A :class:`bool` determining whether :attr:`depends_on` constitutes @@ -80,9 +91,20 @@ class InstructionBase(Record): .. attribute:: no_sync_with - a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - with which no barrier synchronization is necessary, even given the existence - of a dependency chain and apparently conflicting access + a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where + `insn_id` refers to :attr:`id` of :class:`Instruction` instances + and `scope` is one of the following strings: + + - `"local"` + - `"global"` + - `"any"`. + + This indicates that no barrier synchronization is necessary with the given + instruction using barriers of type `scope`, even given the existence of + a dependency chain and apparently conflicting access. + + Note that :attr:`no_sync_with` allows instruction matching through wildcards + and match expressions, just like :attr:`depends_on`. .. rubric:: Conditionals @@ -177,7 +199,7 @@ class InstructionBase(Record): new_predicates.add(pred) - predicates = new_predicates + predicates = frozenset(new_predicates) del new_predicates # }}} @@ -233,7 +255,7 @@ class InstructionBase(Record): assert isinstance(groups, frozenset) assert isinstance(conflicts_with_groups, frozenset) - Record.__init__(self, + ImmutableRecord.__init__(self, id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, @@ -366,7 +388,10 @@ class InstructionBase(Record): if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: - result.append("nosync="+":".join(self.no_sync_with)) + # TODO: Come up with a syntax to express different kinds of + # synchronization scopes. + result.append("nosync="+":".join( + insn_id for insn_id, _ in self.no_sync_with)) if self.groups: result.append("groups=%s" % ":".join(self.groups)) if self.conflicts_with_groups: @@ -382,19 +407,6 @@ # {{{ comparison, hashing - def __eq__(self, other): - if not type(self) == type(other): - return False - - for field_name in self.fields: - if getattr(self, field_name) != getattr(other, field_name): - return False - - return True - - def __ne__(self, other): - return not self.__eq__(other) - def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with :class:`pytools.persistent_dict.PersistentDict`. 
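# A minimal sketch (untested; ids are made up) of the depends_on extensions
# documented above: "dep_query=writes:out" is parsed into a
# loopy.match.MatchExpressionBase, which make_kernel then resolves to the
# matching instruction ids (the instruction itself is excluded).

import loopy as lp

knl = lp.make_kernel(
    "{ [i]: 0 <= i < n }",
    """
    out[i] = 1           {id=write_one}
    out[i] = out[i] + 1  {id=bump, dep_query=writes:out}
    """)

print(knl.id_to_insn["bump"].depends_on)  # frozenset({'write_one'})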
@@ -1159,7 +1171,7 @@ class CInstruction(InstructionBase): for name, expr in self.iname_exprs], assignees=[f(a, *args) for a in self.assignees], predicates=frozenset( - f(pred) for pred in self.predicates)) + f(pred, *args) for pred in self.predicates)) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 7e9bd549fede6abf6d4d5db99896063b34246793..cbacf5e284fe42ae2b5605d12fa3582bcf0ac4fd 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -70,7 +70,7 @@ def _add_dtypes(knl, dtype_dict): for arg in knl.args: new_dtype = dtype_dict.pop(arg.name, None) if new_dtype is not None: - new_dtype = to_loopy_type(new_dtype) + new_dtype = to_loopy_type(new_dtype, target=knl.target) if arg.dtype is not None and arg.dtype != new_dtype: raise RuntimeError( "argument '%s' already has a different dtype " @@ -116,14 +116,14 @@ def add_and_infer_dtypes(knl, dtype_dict): knl = add_dtypes(knl, processed_dtype_dict) - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types return infer_unknown_types(knl, expect_completion=True) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): knl = _add_dtypes_overdetermined(knl, dtype_dict) - from loopy.preprocess import infer_unknown_types + from loopy.type_inference import infer_unknown_types return infer_unknown_types(knl, expect_completion=True) # }}} diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 7d04b8c7330f88af9ee1d79fe19fd87b29b70050..b8633114ddeb9d48eb33a765755302917ca27f63 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -25,14 +25,14 @@ THE SOFTWARE. """ -from pytools import Record +from pytools import ImmutableRecord from mako.template import Template import numpy as np # {{{ rng metadata -class RNGInfo(Record): +class RNGInfo(ImmutableRecord): @property def full_name(self): return "%s%dx%d" % (self.name, self.width, self.bits) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 8a38eebd55b003c624b386bcdf296d2b97e2c97c..f435820b23e8da909f0cff14ff5a1272874e865f 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -37,6 +37,11 @@ class ReductionOperation(object): """ def result_dtypes(self, target, arg_dtype, inames): + """ + :arg arg_dtype: may be None if not known + :returns: None if not known, otherwise the returned type + """ + raise NotImplementedError def neutral_element(self, dtype, inames): @@ -87,6 +92,9 @@ class ScalarReductionOperation(ReductionOperation): return (self.parse_result_type( kernel.target, self.forced_result_type),) + if arg_dtype is None: + return None + return (arg_dtype,) def __hash__(self): diff --git a/loopy/match.py b/loopy/match.py index 053fc9d4883d97b9184d85429aac3b6507d28e0e..ab0038af8dc5e9189a382bb76115998f57aef74e 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -58,6 +58,7 @@ def re_from_glob(s): from fnmatch import translate return re.compile("^"+translate(s.strip())+"$") + # {{{ parsing # {{{ lexer data @@ -72,7 +73,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") -_iname = intern("_reads") +_iname = intern("_iname") _whitespace = intern("_whitespace") @@ -107,6 +108,8 @@ _PREC_NOT = 30 # }}} +# }}} + # {{{ match expression @@ -137,6 +140,9 @@ class All(MatchExpressionBase): def __eq__(self, other): return (type(self) == type(other)) + def __hash__(self): + return hash(type(self)) + class And(MatchExpressionBase): def __init__(self, children): @@ -156,6 +162,9 @@ class 
And(MatchExpressionBase): return (type(self) == type(other) and self.children == other.children) + def __hash__(self): + return hash((type(self), self.children)) + class Or(MatchExpressionBase): def __init__(self, children): @@ -175,6 +184,9 @@ class Or(MatchExpressionBase): return (type(self) == type(other) and self.children == other.children) + def __hash__(self): + return hash((type(self), self.children)) + class Not(MatchExpressionBase): def __init__(self, child): @@ -194,6 +206,9 @@ class Not(MatchExpressionBase): return (type(self) == type(other) and self.child == other.child) + def __hash__(self): + return hash((type(self), self.child)) + class GlobMatchExpressionBase(MatchExpressionBase): def __init__(self, glob): @@ -215,6 +230,9 @@ class GlobMatchExpressionBase(MatchExpressionBase): return (type(self) == type(other) and self.glob == other.glob) + def __hash__(self): + return hash((type(self), self.glob)) + class Id(GlobMatchExpressionBase): def __call__(self, kernel, matchable): @@ -244,7 +262,7 @@ class Reads(GlobMatchExpressionBase): class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) - for name in matchable.inames(kernel)) + for name in matchable.within_inames) # }}} @@ -350,8 +368,6 @@ def parse_match(expr): # }}} -# }}} - # {{{ stack match objects diff --git a/loopy/options.py b/loopy/options.py index 5db1be64624c027a6579f28c99db1bb4e78e3bc3..33b216e1ee7fe95b0930af2643ac3ccc5693a4ee 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -23,16 +23,49 @@ THE SOFTWARE. """ -from pytools import Record +import six +from pytools import ImmutableRecord import re +ALLOW_TERMINAL_COLORS = False + + class _ColoramaStub(object): def __getattribute__(self, name): return "" -class Options(Record): +def _apply_legacy_map(lmap, kwargs): + result = {} + + for name, val in six.iteritems(kwargs): + try: + lmap_value = lmap[name] + except KeyError: + new_name = name + else: + if lmap_value is None: + # ignore this + from warnings import warn + warn("option '%s' is deprecated and was ignored" % name, + DeprecationWarning) + continue + + new_name, translator = lmap_value + if name in result: + raise TypeError("may not pass a value for both '%s' and '%s'" + % (name, new_name)) + + if translator is not None: + val = translator(val) + + result[new_name] = val + + return result + + +class Options(ImmutableRecord): """ Unless otherwise specified, these options are Boolean-valued (i.e. on/off). @@ -91,30 +124,21 @@ class Options(Record): Accepts a file name as a value. Writes to ``sys.stdout`` if none is given. - .. attribute:: highlight_wrapper - - Use syntax highlighting in :attr:`write_wrapper`. - - .. attribute:: write_cl - - Print the generated OpenCL kernel. - Accepts a file name as a value. Writes to - ``sys.stdout`` if none is given. - - .. attribute:: highlight_cl + .. attribute:: write_code - Use syntax highlighting in :attr:`write_cl`. + Print the generated code. Accepts a file name or a boolean as a value. + Writes to ``sys.stdout`` if set to *True*. - .. attribute:: edit_cl + .. attribute:: edit_code Invoke an editor (given by the environment variable :envvar:`EDITOR`) on the generated kernel code, allowing for tweaks before the code is passed on to - the OpenCL implementation for compilation. + the target for compilation. - .. attribute:: cl_build_options + .. attribute:: build_options - Options to pass to the OpenCL compiler when building the kernel. + Options to pass to the target compiler when building the kernel. 
A list of strings. .. attribute:: allow_terminal_colors @@ -126,6 +150,16 @@ class Options(Record): .. attribute:: disable_global_barriers """ + _legacy_options_map = { + "cl_build_options": ("build_options", None), + "write_cl": ("write_code", None), + "highlight_cl": None, + "highlight_wrapper": None, + "disable_wrapper_highlight": None, + "disable_code_highlight": None, + "edit_cl": ("edit_code", None), + } + def __init__( # All Boolean flags in here should default to False for the # string-based interface of make_options (below) to make sense. @@ -133,46 +167,65 @@ class Options(Record): # All defaults are further required to be False when cast to bool # for the update() functionality to work. - self, - - annotate_inames=False, - trace_assignments=False, - trace_assignment_values=False, - ignore_boostable_into=False, - - skip_arg_checks=False, no_numpy=False, return_dict=False, - write_wrapper=False, highlight_wrapper=False, - write_cl=False, highlight_cl=False, - edit_cl=False, cl_build_options=[], - allow_terminal_colors=None, - disable_global_barriers=False, - ): - - if allow_terminal_colors is None: - try: - import colorama # noqa - except ImportError: - allow_terminal_colors = False - else: - allow_terminal_colors = True + self, **kwargs): + + kwargs = _apply_legacy_map(self._legacy_options_map, kwargs) + + try: + import colorama # noqa + except ImportError: + allow_terminal_colors_def = False + else: + allow_terminal_colors_def = True + + allow_terminal_colors_def = ( + ALLOW_TERMINAL_COLORS and allow_terminal_colors_def) - Record.__init__( + ImmutableRecord.__init__( self, - annotate_inames=annotate_inames, - trace_assignments=trace_assignments, - trace_assignment_values=trace_assignment_values, - ignore_boostable_into=ignore_boostable_into, - - skip_arg_checks=skip_arg_checks, no_numpy=no_numpy, - return_dict=return_dict, - write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper, - write_cl=write_cl, highlight_cl=highlight_cl, - edit_cl=edit_cl, cl_build_options=cl_build_options, - allow_terminal_colors=allow_terminal_colors, - disable_global_barriers=disable_global_barriers, + annotate_inames=kwargs.get("annotate_inames", False), + trace_assignments=kwargs.get("trace_assignments", False), + trace_assignment_values=kwargs.get("trace_assignment_values", False), + ignore_boostable_into=kwargs.get("ignore_boostable_into", False), + + skip_arg_checks=kwargs.get("skip_arg_checks", False), + no_numpy=kwargs.get("no_numpy", False), + return_dict=kwargs.get("return_dict", False), + write_wrapper=kwargs.get("write_wrapper", False), + write_code=kwargs.get("write_code", False), + edit_code=kwargs.get("edit_code", False), + build_options=kwargs.get("build_options", []), + allow_terminal_colors=kwargs.get("allow_terminal_colors", + allow_terminal_colors_def), + disable_global_barriers=kwargs.get("disable_global_barriers", + False), ) + # {{{ legacy compatibility + + @property + def edit_cl(self): + return self.edit_code + + @property + def cl_build_options(self): + return self.build_options + + @property + def highlight_cl(self): + return self.allow_terminal_colors + + @property + def highlight_wrapper(self): + return self.allow_terminal_colors + + @property + def write_cl(self): + return self.write_code + + # }}} + def update(self, other): for f in self.__class__.fields: setattr(self, f, getattr(self, f) or getattr(other, f)) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c0f42e55aaf7710a8a91781cb2f0d0af905871dd..6b5488a20bc9d714fb5fde908b559ddebf4b9591 100644 --- 
a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -35,6 +35,8 @@ from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment +# for the benefit of loopy.statistics, for now +from loopy.type_inference import infer_unknown_types import logging logger = logging.getLogger(__name__) @@ -70,6 +72,24 @@ def prepare_for_caching(kernel): # }}} +# {{{ check for writes to predicates + +def check_for_writes_to_predicates(kernel): + from loopy.symbolic import get_dependencies + for insn in kernel.instructions: + pred_vars = ( + frozenset.union( + *(get_dependencies(pred) for pred in insn.predicates)) + if insn.predicates else frozenset()) + written_pred_vars = frozenset(insn.assignee_var_names()) & pred_vars + if written_pred_vars: + raise LoopyError("In instruction '%s': may not write to " + "variable(s) '%s' involved in the instruction's predicates" + % (insn.id, ", ".join(written_pred_vars))) + +# }}} + + # {{{ check reduction iname uniqueness def check_reduction_iname_uniqueness(kernel): @@ -109,193 +129,6 @@ def check_reduction_iname_uniqueness(kernel): # }}} -# {{{ infer types - -def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): - if var_name in kernel.all_params(): - return kernel.index_dtype, [] - - def debug(s): - logger.debug("%s: %s" % (kernel.name, s)) - - dtypes = [] - - import loopy as lp - - symbols_with_unavailable_types = [] - - from loopy.diagnostic import DependencyTypeInferenceFailure - for writer_insn_id in kernel.writer_map().get(var_name, []): - writer_insn = kernel.id_to_insn[writer_insn_id] - if not isinstance(writer_insn, lp.MultiAssignmentBase): - continue - - expr = subst_expander(writer_insn.expression) - - try: - debug(" via expr %s" % expr) - if isinstance(writer_insn, lp.Assignment): - result = type_inf_mapper(expr) - elif isinstance(writer_insn, lp.CallInstruction): - result_dtypes = type_inf_mapper(expr, multiple_types_ok=True) - - result = None - for assignee, comp_dtype in zip( - writer_insn.assignee_var_names(), result_dtypes): - if assignee == var_name: - result = comp_dtype - break - - assert result is not None - - debug(" result: %s" % result) - - dtypes.append(result) - - except DependencyTypeInferenceFailure as e: - debug(" failed: %s" % e) - symbols_with_unavailable_types.append(e.symbol) - - if not dtypes: - return None, symbols_with_unavailable_types - - result = type_inf_mapper.combine(dtypes) - - return result, [] - - -class _DictUnionView: - def __init__(self, children): - self.children = children - - def get(self, key): - try: - return self[key] - except KeyError: - return None - - def __getitem__(self, key): - for ch in self.children: - try: - return ch[key] - except KeyError: - pass - - raise KeyError(key) - - -def infer_unknown_types(kernel, expect_completion=False): - """Infer types on temporaries and arguments.""" - - logger.debug("%s: infer types" % kernel.name) - - def debug(s): - logger.debug("%s: %s" % (kernel.name, s)) - - unexpanded_kernel = kernel - if kernel.substitutions: - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - - new_temp_vars = kernel.temporary_variables.copy() - new_arg_dict = kernel.arg_dict.copy() - - # {{{ fill queue - - # queue contains temporary variables - queue = [] - - import loopy as lp - for tv in six.itervalues(kernel.temporary_variables): - if tv.dtype is lp.auto: - queue.append(tv) - - for arg in kernel.args: - if arg.dtype is None: - 
queue.append(arg) - - # }}} - - from loopy.expression import TypeInferenceMapper - type_inf_mapper = TypeInferenceMapper(kernel, - _DictUnionView([ - new_temp_vars, - new_arg_dict - ])) - - from loopy.symbolic import SubstitutionRuleExpander - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - - # {{{ work on type inference queue - - from loopy.kernel.data import TemporaryVariable, KernelArgument - - failed_names = set() - while queue: - item = queue.pop(0) - - debug("inferring type for %s %s" % (type(item).__name__, item.name)) - - result, symbols_with_unavailable_types = \ - _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander) - - failed = result is None - if not failed: - debug(" success: %s" % result) - if isinstance(item, TemporaryVariable): - new_temp_vars[item.name] = item.copy(dtype=result) - elif isinstance(item, KernelArgument): - new_arg_dict[item.name] = item.copy(dtype=result) - else: - raise LoopyError("unexpected item type in type inference") - else: - debug(" failure") - - if failed: - if item.name in failed_names: - # this item has failed before, give up. - advice = "" - if symbols_with_unavailable_types: - advice += ( - " (need type of '%s'--check for missing arguments)" - % ", ".join(symbols_with_unavailable_types)) - - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break - - # remember that this item failed - failed_names.add(item.name) - - queue_names = set(qi.name for qi in queue) - - if queue_names == failed_names: - # We did what we could... - print(queue_names, failed_names, item.name) - assert not expect_completion - break - - # can't infer type yet, put back into queue - queue.append(item) - else: - # we've made progress, reset failure markers - failed_names = set() - - # }}} - - return unexpanded_kernel.copy( - temporary_variables=new_temp_vars, - args=[new_arg_dict[arg.name] for arg in kernel.args], - ) - -# }}} - - # {{{ decide temporary scope def _get_compute_inames_tagged(kernel, insn, tag_base): @@ -462,7 +295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() - from loopy.expression import TypeInferenceMapper + from loopy.type_inference import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel) # {{{ sequential @@ -626,7 +459,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, - no_sync_with=frozenset([init_id])) + no_sync_with=frozenset([(init_id, "any")])) generated_insns.append(transfer_insn) def _strip_if_scalar(c): @@ -684,7 +517,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): istage += 1 new_insn_add_depends_on.add(prev_id) - new_insn_add_no_sync_with.add(prev_id) + new_insn_add_no_sync_with.add((prev_id, "any")) new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) if nresults == 1: @@ -1061,6 +894,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_unknown_types(kernel, expect_completion=False) + check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) from loopy.kernel.creation import apply_single_writer_depencency_heuristic diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 
ffef62cf8b4f7cc65a3c11e06c7d42c23d18eafd..c8174d94cf9f86bde574b3e1eff353d26438cab8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -24,7 +24,7 @@ THE SOFTWARE. import six -from pytools import Record +from pytools import ImmutableRecord import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa @@ -39,7 +39,7 @@ logger = logging.getLogger(__name__) # {{{ schedule items -class ScheduleItem(Record): +class ScheduleItem(ImmutableRecord): __slots__ = [] def update_persistent_hash(self, key_hash, key_builder): @@ -399,6 +399,17 @@ def get_priority_tiers(wanted, priorities): for tier in get_priority_tiers(wanted, priorities): yield tier + +def sched_item_to_insn_id(sched_item): + # Helper for use in generator expressions, i.e. + # (... for insn_id in sched_item_to_insn_id(item) ...) + if isinstance(sched_item, RunInstruction): + yield sched_item.insn_id + elif isinstance(sched_item, Barrier): + if (hasattr(sched_item, "originating_insn_id") + and sched_item.originating_insn_id is not None): + yield sched_item.originating_insn_id + # }}} @@ -541,7 +552,7 @@ class ScheduleDebugInput(Exception): # {{{ scheduling algorithm -class SchedulerState(Record): +class SchedulerState(ImmutableRecord): """ .. attribute:: kernel @@ -572,12 +583,37 @@ class SchedulerState(Record): A :class:`frozenset` of all inames ever entered. + .. attribute:: enclosing_subkernel_inames + + The inames of the last entered subkernel + .. attribute:: schedule .. attribute:: scheduled_insn_ids .. attribute:: unscheduled_insn_ids + .. attribute:: preschedule + + A sequence of schedule items that must be inserted into the + schedule, maintaining the same ordering + + .. attribute:: prescheduled_insn_ids + + A :class:`frozenset` of any instruction that started prescheduled + + .. attribute:: prescheduled_inames + + A :class:`frozenset` of any iname that started prescheduled + + .. attribute:: may_schedule_global_barriers + + Whether global barrier scheduling is allowed + + .. attribute:: within_subkernel + + Whether the scheduler is inside a subkernel + .. 
attribute:: group_insn_counts A mapping from instruction group names to the number of instructions @@ -619,6 +655,11 @@ def generate_loop_schedules_internal( active_inames_set = frozenset(sched_state.active_inames) + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + # {{{ decide about debug mode debug_mode = False @@ -637,6 +678,10 @@ def generate_loop_schedules_internal( print(75*"=") print("CURRENT SCHEDULE:") print(dump_schedule(sched_state.kernel, sched_state.schedule)) + if sched_state.preschedule: + print(75*"=") + print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") + print(dump_schedule(sched_state.kernel, sched_state.preschedule)) #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") @@ -652,6 +697,54 @@ def generate_loop_schedules_internal( # }}} + # {{{ see if we have reached the start/end of kernel in the preschedule + + if isinstance(next_preschedule_item, CallKernel): + assert sched_state.within_subkernel is False + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + within_subkernel=True, + may_schedule_global_barriers=False, + enclosing_subkernel_inames=sched_state.active_inames), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + if isinstance(next_preschedule_item, ReturnFromKernel): + assert sched_state.within_subkernel is True + # Make sure all subkernel inames have finished. + if sched_state.active_inames == sched_state.enclosing_subkernel_inames: + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + within_subkernel=False, + may_schedule_global_barriers=True), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + + # {{{ see if there are pending local barriers in the preschedule + + # Local barriers do not have associated instructions, so they need to + # be handled separately from instructions. + if ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "local"): + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:]), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + # {{{ see if any insns are ready to be scheduled now # Also take note of insns that have a chance of being schedulable inside @@ -667,9 +760,16 @@ def generate_loop_schedules_internal( # schedule generation order. return (insn.priority, len(active_groups & insn.groups), insn.id) - insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids, + insn_ids_to_try = sorted( + # Non-prescheduled instructions go first. 
+ sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids, key=insn_sort_key, reverse=True) + insn_ids_to_try.extend( + insn_id + for item in sched_state.preschedule + for insn_id in sched_item_to_insn_id(item)) + for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] @@ -705,6 +805,46 @@ def generate_loop_schedules_internal( print("instruction '%s' won't work under inames '%s'" % (format_insn(kernel, insn.id), ",".join(have-want))) + # {{{ check if scheduling this insn is compatible with preschedule + + if insn_id in sched_state.prescheduled_insn_ids: + if isinstance(next_preschedule_item, RunInstruction): + next_preschedule_insn_id = next_preschedule_item.insn_id + elif ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "global"): + assert hasattr(next_preschedule_item, "originating_insn_id") + assert next_preschedule_item.originating_insn_id is not None + next_preschedule_insn_id = next_preschedule_item.originating_insn_id + else: + next_preschedule_insn_id = None + + if next_preschedule_insn_id != insn_id: + if debug_mode: + print("can't schedule '%s' because another preschedule " + "instruction precedes it" % format_insn(kernel, insn.id)) + is_ready = False + + # }}} + + # {{{ check if scheduler state allows insn scheduling + + from loopy.kernel.instruction import BarrierInstruction + if isinstance(insn, BarrierInstruction) and insn.kind == "global": + if not sched_state.may_schedule_global_barriers: + if debug_mode: + print("can't schedule '%s' because global barriers are " + "not currently allowed" % format_insn(kernel, insn.id)) + is_ready = False + else: + if not sched_state.within_subkernel: + if debug_mode: + print("can't schedule '%s' because not within subkernel" + % format_insn(kernel, insn.id)) + is_ready = False + + # }}} + # {{{ determine group-based readiness if insn.conflicts_with_groups & active_groups: @@ -761,6 +901,10 @@ def generate_loop_schedules_internal( unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, schedule=( sched_state.schedule + (RunInstruction(insn_id=insn.id),)), + preschedule=( + sched_state.preschedule + if insn_id not in sched_state.prescheduled_insn_ids + else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, uses_of_boostability=( sched_state.uses_of_boostability @@ -790,7 +934,17 @@ def generate_loop_schedules_internal( if last_entered_loop is not None: can_leave = True - if last_entered_loop not in sched_state.breakable_inames: + if ( + last_entered_loop in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, LeaveLoop) + and next_preschedule_item.iname == last_entered_loop)): + # A prescheduled loop can only be left if the preschedule agrees. + if debug_mode: + print("cannot leave '%s' because of preschedule constraints" + % last_entered_loop) + can_leave = False + elif last_entered_loop not in sched_state.breakable_inames: # If the iname is not breakable, then check that we've # scheduled all the instructions that require it. 
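# A made-up illustration (not part of the patch) of the preschedule check
# above: if the remaining preschedule is
#
#     [RunInstruction(insn_id="store"), ReturnFromKernel(...)]
#
# then a prescheduled instruction "load" may not be picked now; only "store",
# the next preschedule item, is eligible. Instructions that did not start out
# prescheduled stay schedulable at any point, subject to the subkernel and
# global-barrier checks that follow.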
@@ -857,12 +1011,19 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: + for sub_sched in generate_loop_schedules_internal( sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), - active_inames=sched_state.active_inames[:-1]), + active_inames=sched_state.active_inames[:-1], + preschedule=( + sched_state.preschedule + if last_entered_loop + not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), + ), allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -902,6 +1063,38 @@ def generate_loop_schedules_internal( # {{{ check if scheduling this iname now is allowed/plausible + if ( + iname in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, EnterLoop) + and next_preschedule_item.iname == iname)): + if debug_mode: + print("scheduling %s prohibited by preschedule constraints" + % iname) + continue + + if ( + not sched_state.within_subkernel + and iname not in sched_state.prescheduled_inames): + # Avoid messing up some orderings such as picking: + # + # EnterLoop(temporary.reload) + # CallKernel + # ... + # + # instead of + # + # CallKernel + # EnterLoop(temporary.reload) + # ... + # + # This serves as a heuristic to catch some bad decisions early; the + # scheduler will not allow the first variant regardless. + if debug_mode: + print("scheduling '%s' prohibited because we are outside " + "a subkernel" % iname) + continue + currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) if ( @@ -1063,6 +1256,10 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + preschedule=( + sched_state.preschedule + if iname not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), ), allow_boost=rec_allow_boost, debug=debug): @@ -1082,7 +1279,10 @@ def generate_loop_schedules_internal( if inp: raise ScheduleDebugInput(inp) - if not sched_state.active_inames and not sched_state.unscheduled_insn_ids: + if ( + not sched_state.active_inames + and not sched_state.unscheduled_insn_ids + and not sched_state.preschedule): # if done, yield result debug.log_success(sched_state.schedule) @@ -1138,7 +1338,8 @@ def convert_barrier_instructions_to_barriers(kernel, schedule): if isinstance(insn, BarrierInstruction): result.append(Barrier( kind=insn.kind, - originating_insn_id=insn.id)) + originating_insn_id=insn.id, + comment="Barrier inserted due to %s" % insn.id)) continue result.append(sched_item) @@ -1150,7 +1351,7 @@ # {{{ barrier insertion/verification -class DependencyRecord(Record): +class DependencyRecord(ImmutableRecord): """ .. 
attribute:: source @@ -1175,7 +1376,7 @@ class DependencyRecord(Record): """ def __init__(self, source, target, dep_descr, variable, var_kind): - Record.__init__(self, + ImmutableRecord.__init__(self, source=source, target=target, dep_descr=dep_descr, @@ -1209,8 +1410,8 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if reverse: source, target = target, source - if source.id in target.no_sync_with: - return None + if source.id in kernel.get_nosync_set(target.id, var_kind): + return # {{{ check that a dependency exists @@ -1309,6 +1510,9 @@ def get_tail_starting_at_last_barrier(schedule, kind): elif isinstance(sched_item, (EnterLoop, LeaveLoop)): pass + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + pass + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1322,7 +1526,8 @@ def insn_ids_from_schedule(schedule): if isinstance(sched_item, RunInstruction): result.append(sched_item.insn_id) - elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier)): + elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel, + ReturnFromKernel)): pass else: @@ -1455,8 +1660,22 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): source=dep_src_insn_id, reverse=reverse, var_kind=kind) if dep: - issue_barrier(dep=dep) - break + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no " + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + kind)) + else: + issue_barrier(dep=dep) + break # }}} @@ -1500,7 +1719,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): "Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no" + "instruction option to state that no " "synchronization is needed)" % ( dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1515,6 +1734,10 @@ result.append(sched_item) candidates.add(sched_item.insn_id) + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + result.append(sched_item) + i += 1 + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1536,7 +1759,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): def generate_loop_schedules(kernel, debug_args={}): from loopy.kernel import kernel_state - if kernel.state != kernel_state.PREPROCESSED: + if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1547,6 +1770,18 @@ def generate_loop_schedules(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) + preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else () + + prescheduled_inames = set( + insn.iname + for insn in preschedule + if isinstance(insn, EnterLoop)) + + prescheduled_insn_ids = set( + insn_id + for item in preschedule + for insn_id in sched_item_to_insn_id(item)) + from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag ilp_inames = set( iname @@ -1573,14 +1808,22 @@ def generate_loop_schedules(kernel, debug_args={}): ilp_inames=ilp_inames, vec_inames=vec_inames, + prescheduled_inames=prescheduled_inames, + 
prescheduled_insn_ids=prescheduled_insn_ids, + # time-varying part active_inames=(), entered_inames=frozenset(), + enclosing_subkernel_inames=(), schedule=(), unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), + within_subkernel=kernel.state != kernel_state.SCHEDULED, + may_schedule_global_barriers=True, + + preschedule=preschedule, # ilp and vec are not parallel for the purposes of the scheduler parallel_inames=parallel_inames - ilp_inames - vec_inames, @@ -1638,18 +1881,15 @@ def generate_loop_schedules(kernel, debug_args={}): gsize, lsize = kernel.get_grid_size_upper_bounds() - if gsize or lsize: + if (gsize or lsize): if not kernel.options.disable_global_barriers: logger.info("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="global", verify_only=True) logger.info("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="local", verify_only=False) - logger.info("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( @@ -1658,7 +1898,12 @@ def generate_loop_schedules(kernel, debug_args={}): from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - new_kernel = map_schedule_onto_host_or_device(new_kernel) + if kernel.state != kernel_state.SCHEDULED: + # Device mapper only gets run once. + new_kernel = map_schedule_onto_host_or_device(new_kernel) + + from loopy.schedule.tools import add_extra_args_to_schedule + new_kernel = add_extra_args_to_schedule(new_kernel) yield new_kernel debug.start() diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index ca782a3d8ca85ea6250f7c9317ca0947db28d5e8..1a0789c2f61e21e4a0371e2a73195c9771245527 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -23,14 +23,13 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.kernel.data import TemporaryVariable, temp_var_scope -from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem, - EnterLoop, LeaveLoop, ReturnFromKernel, - RunInstruction) -from pytools import Record, memoize_method +from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop, + ReturnFromKernel, RunInstruction) +from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): + # FIXME: Should be idempotent. from loopy.kernel import kernel_state assert kernel.state == kernel_state.SCHEDULED @@ -53,659 +52,14 @@ def map_schedule_onto_host_or_device(kernel): kernel = map_schedule_onto_host_or_device_impl( kernel, device_prog_name_gen) - return restore_and_save_temporaries( - add_extra_args_to_schedule(kernel)) - - -# {{{ Schedule / instruction utilities - -def get_block_boundaries(schedule): - """ - Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. - """ - block_bounds = {} - active_blocks = [] - for idx, sched_item in enumerate(schedule): - if isinstance(sched_item, BeginBlockItem): - active_blocks.append(idx) - elif isinstance(sched_item, EndBlockItem): - start = active_blocks.pop() - block_bounds[start] = idx - block_bounds[idx] = start - return block_bounds - - -def get_hw_inames(kernel, insn): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. 
- """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) - - -def get_common_hw_inames(kernel, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection( - *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids)) - - -def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): - from loopy.kernel.data import HardwareParallelTag - new_schedule = [] - - for item in kernel.schedule: - if isinstance(item, (EnterLoop, LeaveLoop)): - tag = kernel.iname_to_tag.get(item.iname) - if isinstance(tag, HardwareParallelTag): - continue - new_schedule.append(item) - - return kernel.copy(schedule=new_schedule) - -# }}} - - -# {{{ Use / def utilities - -def filter_out_subscripts(exprs): - """ - Remove subscripts from expressions in `exprs`. - """ - result = set() - from pymbolic.primitives import Subscript - for expr in exprs: - if isinstance(expr, Subscript): - expr = expr.aggregate - result.add(expr) - return result - - -def filter_items_by_varname(pred, kernel, items): - """ - Keep only the values in `items` whose variable names satisfy `pred`. - """ - from pymbolic.primitives import Subscript, Variable - result = set() - for item in items: - base = item - if isinstance(base, Subscript): - base = base.aggregate - if isinstance(base, Variable): - base = base.name - if pred(kernel, base): - result.add(item) - return result - - -from functools import partial - -filter_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables) - -filter_scalar_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables and - len(kernel.temporary_variables[name].shape) == 0) - - -def get_use_set(insn, include_subscripts=True): - """ - Return the use-set of the instruction, for liveness analysis. - """ - result = insn.read_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_def_set(insn, include_subscripts=True): - """ - Return the def-set of the instruction, for liveness analysis. - """ - result = insn.write_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_def_and_use_lists_for_all_temporaries(kernel): - """ - Return a pair `def_lists`, `use_lists` which map temporary variable - names to lists of instructions where they are defined or used. 
- """ - def_lists = dict((t, []) for t in kernel.temporary_variables) - use_lists = dict((t, []) for t in kernel.temporary_variables) - - for insn in kernel.instructions: - assignees = get_def_set(insn, include_subscripts=False) - dependencies = get_use_set(insn, include_subscripts=False) - - from pymbolic.primitives import Variable - - for assignee in assignees: - if isinstance(assignee, Variable): - assignee = assignee.name - if assignee in kernel.temporary_variables: - def_lists[assignee].append(insn.id) - - for dep in dependencies: - if isinstance(dep, Variable): - dep = dep.name - if dep in kernel.temporary_variables: - use_lists[dep].append(insn.id) - - return def_lists, use_lists - - -def get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx, end_idx): - defs = set() - uses = set() - - for idx in range(start_idx, end_idx + 1): - sched_item = schedule[idx] - if isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] - defs.update( - filter_temporaries( - kernel, get_def_set(insn))) - uses.update( - filter_temporaries( - kernel, get_use_set(insn))) - - return defs, uses - -# }}} - - -# {{{ Liveness analysis - -def compute_live_temporaries(kernel, schedule): - """ - Compute live-in and live-out sets for temporary variables. - """ - live_in = [set() for i in range(len(schedule) + 1)] - live_out = [set() for i in range(len(schedule))] - - id_to_insn = kernel.id_to_insn - block_bounds = get_block_boundaries(schedule) - - # {{{ Liveness analysis implementation - - def compute_subrange_liveness(start_idx, end_idx): - idx = end_idx - while start_idx <= idx: - sched_item = schedule[idx] - if isinstance(sched_item, LeaveLoop): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - prev_live_in = live_in[start].copy() - live_in[start] = live_out[start] = live_in[start + 1] - # Propagate live values through the loop. - if live_in[start] != prev_live_in: - live_out[idx] |= live_in[start] - live_in[idx] = live_out[idx] - compute_subrange_liveness(start + 1, idx - 1) - idx = start - 1 - - elif isinstance(sched_item, ReturnFromKernel): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - live_in[start] = live_out[start] = live_in[start + 1] - idx = start - 1 - - elif isinstance(sched_item, RunInstruction): - live_out[idx] = live_in[idx + 1] - insn = id_to_insn[sched_item.insn_id] - defs = filter_scalar_temporaries(kernel, - get_def_set(insn, include_subscripts=False)) - uses = filter_temporaries(kernel, - get_use_set(insn, include_subscripts=False)) - live_in[idx] = (live_out[idx] - defs) | uses - idx -= 1 - - elif isinstance(sched_item, Barrier): - live_in[idx] = live_out[idx] = live_in[idx + 1] - idx -= 1 - else: - raise LoopyError("unexpected type of schedule item: %s" - % type(sched_item).__name__) - - # }}} - - # Compute live variables - compute_subrange_liveness(0, len(schedule) - 1) - live_in = live_in[:-1] - - if 0: - print(kernel) - print("Live-in values:") - for i, li in enumerate(live_in): - print("{}: {}".format(i, ", ".join(li))) - print("Live-out values:") - for i, lo in enumerate(live_out): - print("{}: {}".format(i, ", ".join(lo))) - - # Strip off subscripts. 
- live_in = [filter_out_subscripts(li) for li in live_in] - live_out = [filter_out_subscripts(lo) for lo in live_out] - - return live_in, live_out - -# }}} - - -# {{{ Temporary promotion - -class PromotedTemporary(Record): - """ - .. attribute:: name - - The name of the new temporary. - - .. attribute:: orig_temporary - - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. - - .. attribute:: shape_prefix - - A list of expressions, to be added in front of the shape - of the promoted temporary value - """ - - @memoize_method - def as_variable(self): - temporary = self.orig_temporary - return TemporaryVariable( - name=self.name, - dtype=temporary.dtype, - scope=temp_var_scope.GLOBAL, - shape=self.new_shape) - - @property - def new_shape(self): - return self.shape_prefix + self.orig_temporary.shape - - -def determine_temporaries_to_promote(kernel, temporaries, name_gen): - """ - For each temporary in the passed list of temporaries, construct a - :class:`PromotedTemporary` which describes how the temporary should - get promoted into global storage. - - :returns: A :class:`dict` mapping temporary names from `temporaries` to - :class:`PromotedTemporary` objects - """ - new_temporaries = {} - - def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel) - - from loopy.kernel.data import LocalIndexTag - - for temporary in temporaries: - temporary = kernel.temporary_variables[temporary] - if temporary.scope == temp_var_scope.GLOBAL: - # Nothing to be done for global temporaries (I hope) - continue - - assert temporary.base_storage is None, \ - "Cannot promote temporaries with base_storage to global" - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = get_common_hw_inames(kernel, - def_lists[temporary.name] + use_lists[temporary.name]) - - # This takes advantage of the fact that g < l in the alphabet :) - hw_inames = sorted(hw_inames, - key=lambda iname: str(kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - shape_prefix = [] - - backing_hw_inames = [] - for iname in hw_inames: - tag = kernel.iname_to_tag[iname] - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. 
- continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - shape_prefix.append( - aff_to_expr( - static_max_of_pw_aff( - kernel.get_iname_bounds(iname).size, False))) - - backing_temporary = PromotedTemporary( - name=name_gen(temporary.name), - orig_temporary=temporary, - shape_prefix=tuple(shape_prefix), - hw_inames=backing_hw_inames) - new_temporaries[temporary.name] = backing_temporary - - return new_temporaries - -# }}} - - -# {{{ Domain augmentation - -def augment_domain_for_temporary_promotion( - kernel, domain, promoted_temporary, mode, name_gen): - """ - Add new axes to the domain corresponding to the dimensions of - `promoted_temporary`. - """ - import islpy as isl - - orig_temporary = promoted_temporary.orig_temporary - orig_dim = domain.dim(isl.dim_type.set) - dims_to_insert = len(orig_temporary.shape) - - iname_to_tag = {} - - # Add dimension-dependent inames. - dim_inames = [] - - domain = domain.add(isl.dim_type.set, dims_to_insert) - for t_idx in range(len(orig_temporary.shape)): - new_iname = name_gen("{name}_{mode}_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - domain = domain.set_dim_name( - isl.dim_type.set, orig_dim + t_idx, new_iname) - if orig_temporary.is_local: - # If the temporary is has local scope, then loads / stores can be - # done in parallel. - from loopy.kernel.data import AutoFitLocalIndexTag - iname_to_tag[new_iname] = AutoFitLocalIndexTag() - - dim_inames.append(new_iname) - - # Add size information. - aff = isl.affs_from_space(domain.space) - domain &= aff[0].le_set(aff[new_iname]) - size = orig_temporary.shape[t_idx] - from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) - - hw_inames = [] - - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): - new_iname = name_gen("{name}_{mode}_hw_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname] - - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) - - # The operations on the domain above return a Set object, but the - # underlying domain should be expressible as a single BasicSet. - domain_list = domain.get_basic_set_list() - assert domain_list.n_basic_set() == 1 - domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames, iname_to_tag - -# }}} - - -def restore_and_save_temporaries(kernel): - """ - Add code that loads / spills the temporaries in the kernel which are - live across sub-kernel calls. - """ - # Compute live temporaries. - live_in, live_out = compute_live_temporaries(kernel, kernel.schedule) - - # Create kernel variables based on live temporaries. - inter_kernel_temporaries = set() - - call_count = 0 - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) - call_count += 1 - - if call_count == 1: - # Single call corresponds to a kernel which has not been split - - # no need for restores / spills of temporaries. 
- return kernel - - name_gen = kernel.get_var_name_generator() - new_temporaries = determine_temporaries_to_promote( - kernel, inter_kernel_temporaries, name_gen) - - # {{{ Insert loads and spills of new temporaries - - new_schedule = [] - new_instructions = [] - new_iname_to_tag = {} - - idx = 0 - schedule = kernel.schedule - while idx < len(schedule): - sched_item = schedule[idx] - - if not isinstance(sched_item, CallKernel): - new_schedule.append(sched_item) - idx += 1 - continue - - subkernel_prolog = [] - subkernel_epilog = [] - subkernel_schedule = [] - - start_idx = idx - idx += 1 - while not isinstance(schedule[idx], ReturnFromKernel): - subkernel_schedule.append(schedule[idx]) - idx += 1 - - subkernel_defs, subkernel_uses = \ - get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx + 1, idx - 1) - - from loopy.kernel.data import temp_var_scope - # Filter out temporaries that are global. - subkernel_globals = set( - tval for tval in subkernel_defs | subkernel_uses - if kernel.temporary_variables[tval].scope == temp_var_scope.GLOBAL) - - tvals_to_spill = (subkernel_defs - subkernel_globals) & live_out[idx] - # Need to load tvals_to_spill, to avoid overwriting entries that the - # code doesn't touch when doing the spill. - tvals_to_load = ((subkernel_uses - subkernel_globals) - | tvals_to_spill) & live_in[start_idx] - - # Add new arguments. - sched_item = sched_item.copy( - extra_args=sched_item.extra_args - + sorted(new_temporaries[tv].name - for tv in tvals_to_load | tvals_to_spill)) - - # {{{ Add all the loads and spills. - - def insert_loads_or_spills(tvals, mode): - assert mode in ["load", "spill"] - local_temporaries = set() - - code_block = \ - subkernel_prolog if mode == "load" else subkernel_epilog - - new_kernel = kernel - - for tval in tvals: - from loopy.kernel.tools import DomainChanger - tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(new_kernel, - frozenset(sched_item.extra_inames + tval_hw_inames)) - domain = dchg.domain - - domain, hw_inames, dim_inames, itt = \ - augment_domain_for_temporary_promotion( - new_kernel, domain, new_temporaries[tval], mode, - name_gen) - new_iname_to_tag.update(itt) - - new_kernel = dchg.get_kernel_with(domain) - - # Add the load / spill instruction. - insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode)) - - def subscript_or_var(agg, subscript): - from pymbolic.primitives import Subscript, Variable - if len(subscript) == 0: - return Variable(agg) - else: - return Subscript( - Variable(agg), - tuple(map(Variable, subscript))) - - args = ( - subscript_or_var( - tval, dim_inames), - subscript_or_var( - new_temporaries[tval].name, hw_inames + dim_inames)) - - if mode == "spill": - args = reversed(args) - - from loopy.kernel.data import Assignment - new_insn = Assignment(*args, id=insn_id, - within_inames=frozenset(hw_inames + dim_inames), - within_inames_is_final=True) - - new_instructions.append(new_insn) - - loop_begin = [EnterLoop(iname=iname) for iname in dim_inames] - loop_end = list(reversed([ - LeaveLoop(iname=iname) for iname in dim_inames])) - code_block.extend( - loop_begin + - [RunInstruction(insn_id=insn_id)] + - loop_end) - if new_temporaries[tval].orig_temporary.is_local: - local_temporaries.add(new_temporaries[tval].name) - - # After loading / before spilling local temporaries, we need to - # insert a barrier. 
- if local_temporaries: - if mode == "load": - subkernel_prolog.append( - Barrier(kind="local", - comment="for loads of {0}".format( - ", ".join(sorted(local_temporaries))))) - else: - subkernel_epilog.insert(0, - Barrier(kind="local", - comment="for spills of {0}".format( - ", ".join(sorted(local_temporaries))))) - return new_kernel - - kernel = insert_loads_or_spills(tvals_to_load, "load") - kernel = insert_loads_or_spills(tvals_to_spill, "spill") - - # }}} - - new_schedule.extend( - [sched_item] + - subkernel_prolog + - subkernel_schedule + - subkernel_epilog + - # ReturnFromKernel - [schedule[idx]]) - - # ReturnFromKernel - idx += 1 - - # }}} - - new_iname_to_tag.update(kernel.iname_to_tag) - updated_temporary_variables = dict( - (t.name, t.as_variable()) for t in new_temporaries.values()) - updated_temporary_variables.update(kernel.temporary_variables) - - kernel = kernel.copy( - iname_to_tag=new_iname_to_tag, - temporary_variables=updated_temporary_variables, - instructions=kernel.instructions + new_instructions, - schedule=new_schedule - ) - - from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) - - # Once assign_automatic_axes() does its job, loops in the schedule - # for newly hardware-tagged inames are no longer necessary (and in - # fact illegal), so remove them. - kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) - return kernel -def add_extra_args_to_schedule(kernel): - """ - Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` - instructions in the schedule with global temporaries. - """ - new_schedule = [] - - block_bounds = get_block_boundaries(kernel.schedule) - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - defs, uses = get_temporaries_defined_and_used_in_subrange( - kernel, kernel.schedule, idx + 1, block_bounds[idx] - 1) - # Filter out temporaries that are global. 
-            extra_args = (tv for tv in defs | uses if
-                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
-                and
-                kernel.temporary_variables[tv].initializer is None)
-            new_schedule.append(sched_item.copy(extra_args=sorted(extra_args)))
-        else:
-            new_schedule.append(sched_item)
-
-    return kernel.copy(schedule=new_schedule)
-
-
 def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
     schedule = kernel.schedule
     loop_bounds = get_block_boundaries(schedule)

-    # {{{ Inner mapper function
+    # {{{ inner mapper function

     dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[])
     dummy_return = ReturnFromKernel(kernel_name="")
@@ -760,6 +114,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
                     [dummy_call.copy()] +
                     current_chunk +
                     [dummy_return.copy()])
+                new_schedule.append(sched_item)
                 current_chunk = []
             else:
                 current_chunk.append(sched_item)
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de677e72708be844a5276b3d40ace8b1dad9da0
--- /dev/null
+++ b/loopy/schedule/tools.py
@@ -0,0 +1,191 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.kernel.data import temp_var_scope
+from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem,
+                            RunInstruction, Barrier)
+
+from pytools import memoize_method
+
+
+# {{{ block boundary finder
+
+def get_block_boundaries(schedule):
+    """
+    Return a dictionary mapping indices of
+    :class:`loopy.schedule.BeginBlockItem`s to
+    :class:`loopy.schedule.EndBlockItem`s and vice versa.
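+
+    Example (a sketch with a hypothetical three-item schedule;
+    :class:`loopy.schedule.CallKernel` begins a block and
+    :class:`loopy.schedule.ReturnFromKernel` ends it)::
+
+        schedule = [CallKernel(...), RunInstruction(...), ReturnFromKernel(...)]
+        get_block_boundaries(schedule)  # -> {0: 2, 2: 0}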
+ """ + block_bounds = {} + active_blocks = [] + for idx, sched_item in enumerate(schedule): + if isinstance(sched_item, BeginBlockItem): + active_blocks.append(idx) + elif isinstance(sched_item, EndBlockItem): + start = active_blocks.pop() + block_bounds[start] = idx + block_bounds[idx] = start + return block_bounds + +# }}} + + +# {{{ instruction query utility + +class InstructionQuery(object): + + def __init__(self, kernel): + self.kernel = kernel + block_bounds = get_block_boundaries(kernel.schedule) + subkernel_slices = {} + from six import iteritems + for start, end in iteritems(block_bounds): + sched_item = kernel.schedule[start] + if isinstance(sched_item, CallKernel): + subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) + self.subkernel_slices = subkernel_slices + + @memoize_method + def subkernels(self): + return frozenset(self.subkernel_slices.keys()) + + @memoize_method + def insns_reading_or_writing(self, var): + return frozenset(insn.id for insn in self.kernel.instructions + if var in insn.read_dependency_names() + or var in insn.assignee_var_names()) + + @memoize_method + def insns_in_subkernel(self, subkernel): + return frozenset(sched_item.insn_id for sched_item + in self.kernel.schedule[self.subkernel_slices[subkernel]] + if isinstance(sched_item, RunInstruction)) + + @memoize_method + def temporaries_read_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].read_dependency_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_written_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].assignee_var_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_read_or_written_in_subkernel(self, subkernel): + return ( + self.temporaries_read_in_subkernel(subkernel) | + self.temporaries_written_in_subkernel(subkernel)) + + @memoize_method + def inames_in_subkernel(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + return frozenset(self.kernel.schedule[subkernel_start].extra_inames) + + @memoize_method + def pre_and_post_barriers(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + subkernel_end = self.subkernel_slices[subkernel].stop + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + @memoize_method + def hw_inames(self, insn_id): + """ + Return the inames that insn runs in and that are tagged as hardware + parallel. + """ + from loopy.kernel.data import HardwareParallelTag + return set(iname for iname in self.kernel.insn_inames(insn_id) + if isinstance(self.kernel.iname_to_tag.get(iname), + HardwareParallelTag)) + + @memoize_method + def common_hw_inames(self, insn_ids): + """ + Return the common set of hardware parallel tagged inames among + the list of instructions. + """ + # Get the list of hardware inames in which the temporary is defined. 
+ if len(insn_ids) == 0: + return set() + return set.intersection(*(self.hw_inames(id) for id in insn_ids)) + +# }}} + + +# {{{ add extra args to schedule + +def add_extra_args_to_schedule(kernel): + """ + Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` + instructions in the schedule with global temporaries. + """ + new_schedule = [] + + insn_query = InstructionQuery(kernel) + + for sched_item in kernel.schedule: + if isinstance(sched_item, CallKernel): + subrange_temporaries = (insn_query + .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + more_args = set(tv + for tv in subrange_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( + extra_args=sched_item.extra_args + sorted(more_args))) + else: + new_schedule.append(sched_item) + + return kernel.copy(schedule=new_schedule) + +# }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index 47abfe53a4bfe8598cd09425b5baa81f13525c37..2ec5eb0d4d5e32dbd9eb201ab718078a6b36f7d8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -25,6 +25,7 @@ THE SOFTWARE. import six import loopy as lp +import numpy as np import warnings from islpy import dim_type import islpy as isl @@ -39,13 +40,13 @@ __doc__ = """ .. currentmodule:: loopy -.. autofunction:: get_op_poly +.. autoclass:: ToCountMap +.. autoclass:: Op +.. autoclass:: MemAccess -.. autofunction:: get_gmem_access_poly - -.. autofunction:: sum_mem_access_to_bytes - -.. autofunction:: get_synchronization_poly +.. autofunction:: get_op_map +.. autofunction:: get_mem_access_map +.. autofunction:: get_synchronization_map .. autofunction:: gather_access_footprints .. autofunction:: gather_access_footprint_bytes @@ -55,18 +56,27 @@ __doc__ = """ # {{{ ToCountMap -class ToCountMap: - """Maps any type of key to an arithmetic type.""" +class ToCountMap(object): + """Maps any type of key to an arithmetic type. + + .. automethod:: filter_by + .. automethod:: filter_by_func + .. automethod:: group_by + .. automethod:: to_bytes + .. automethod:: sum + .. automethod:: eval_and_sum + + """ def __init__(self, init_dict=None): if init_dict is None: init_dict = {} - self.dict = init_dict + self.count_map = init_dict def __add__(self, other): - result = self.dict.copy() - for k, v in six.iteritems(other.dict): - result[k] = self.dict.get(k, 0) + v + result = self.count_map.copy() + for k, v in six.iteritems(other.count_map): + result[k] = self.count_map.get(k, 0) + v return ToCountMap(result) def __radd__(self, other): @@ -80,8 +90,8 @@ class ToCountMap: def __mul__(self, other): if isinstance(other, isl.PwQPolynomial): return ToCountMap(dict( - (index, self.dict[index]*other) - for index in self.dict.keys())) + (index, self.count_map[index]*other) + for index in self.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {0} {1}." 
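
The two hunks above give ``ToCountMap`` its arithmetic: ``__add__`` merges two
maps key-wise, and ``__mul__`` scales every count by an
:class:`islpy.PwQPolynomial`. A minimal sketch of the intended semantics
(hypothetical string keys, assuming the class as defined in this patch)::

    import islpy as isl
    from loopy.statistics import ToCountMap

    a = ToCountMap({"add": isl.PwQPolynomial("{ 1 }")})
    b = ToCountMap({"add": isl.PwQPolynomial("{ 2 }")})

    print((a + b)["add"])                           # key-wise merge: { 3 }
    print((a * isl.PwQPolynomial("{ 4 }"))["add"])  # scaled counts: { 4 }

Missing keys read as the zero polynomial via ``__getitem__`` below, so sums
over filtered maps stay well-defined.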
@@ -91,12 +101,262 @@ class ToCountMap:

     def __getitem__(self, index):
         try:
-            return self.dict[index]
+            return self.count_map[index]
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')

+    def __setitem__(self, index, value):
+        self.count_map[index] = value
+
     def __repr__(self):
-        return repr(self.dict)
+        return repr(self.count_map)
+
+    def __len__(self):
+        return len(self.count_map)
+
+    def items(self):
+        return self.count_map.items()
+
+    def keys(self):
+        return self.count_map.keys()
+
+    def pop(self, item):
+        return self.count_map.pop(item)
+
+    def copy(self):
+        return ToCountMap(dict(self.count_map))
+
+    def filter_by(self, **kwargs):
+        """Remove items without specified key fields.
+
+        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
+                             the :class:`ToCountMap`, each given a list of
+                             allowable values for that key field.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` that match the field values
+                 passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        from loopy.types import to_loopy_type
+        if 'dtype' in kwargs.keys():
+            kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
+
+        # for each item in self.count_map
+        for self_key, self_val in self.items():
+            try:
+                # check to see if key attribute values match all filters
+                for arg_field, allowable_vals in kwargs.items():
+                    attr_val = getattr(self_key, arg_field)
+                    # see if the value is in the filter list
+                    if attr_val not in allowable_vals:
+                        break
+                else:  # loop terminated without break or error
+                    result_map[self_key] = self_val
+            except AttributeError:
+                # the field passed is not a field of this key
+                continue
+
+        return result_map
+
+    def filter_by_func(self, func):
+        """Keep items that pass a test.
+
+        :parameter func: A function that takes a map key as a parameter and
+                         returns a :class:`bool`.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` for which func(key) is true.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+
+            def filter_func(key):
+                return key.stride > 1 and key.stride <= 4
+
+            filtered_map = mem_map.filter_by_func(filter_func)
+            tot = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # for each item in self.count_map, call func on the key
+        for self_key, self_val in self.items():
+            if func(self_key):
+                result_map[self_key] = self_val
+
+        return result_map
+
+    def group_by(self, *args):
+        """Group map items together, distinguishing by only the key fields
+           passed in args.
+
+        :parameter \*args: Zero or more :class:`str` fields of map keys.
+
+        :return: A :class:`ToCountMap` containing the same total counts
+                 grouped together by new keys that only contain the fields
+                 specified in the arguments passed.
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_map(knl) + grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + + f32_global_ld = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_global_st = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + f32_local_ld = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_local_st = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + + op_map = get_op_map(knl) + ops_dtype = op_map.group_by('dtype') + + f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) + f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) + i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + # make sure all item keys have same type + if self.count_map: + key_type = type(list(self.keys())[0]) + if not all(isinstance(x, key_type) for x in self.keys()): + raise ValueError("ToCountMap: group_by() function may only " + "be used on ToCountMaps with uniform keys") + else: + return result_map + + # for each item in self.count_map + for self_key, self_val in self.items(): + new_key = key_type() + + # set all specified fields + for field in args: + setattr(new_key, field, getattr(self_key, field)) + + if new_key in result_map.keys(): + result_map[new_key] += self_val + else: + result_map[new_key] = self_val + + return result_map + + def to_bytes(self): + """Convert counts to bytes using data type in map key. + + :return: A :class:`ToCountMap` mapping each original key to a + :class:`islpy.PwQPolynomial` with counts in bytes rather than + instances. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + bytes_map = get_mem_access_map(knl).to_bytes() + params = {'n': 512, 'm': 256, 'l': 128} + + s1_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['load']).eval_and_sum(params) + s2_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['load']).eval_and_sum(params) + s1_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['store']).eval_and_sum(params) + s2_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['store']).eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result = self.copy() + + for key, val in self.items(): + bytes_processed = int(key.dtype.itemsize) * val + result[key] = bytes_processed + + return result + + + def sum(self): + """Add all counts in ToCountMap. + + :return: A :class:`islpy.PwQPolynomial` containing the sum of counts. + + """ + total = isl.PwQPolynomial('{ 0 }') + for k, v in self.items(): + if not isinstance(v, isl.PwQPolynomial): + raise ValueError("ToCountMap: sum() encountered type {0} but " + "may only be used on PwQPolynomials." + .format(type(v))) + total += v + return total + + + def eval_and_sum(self, params): + """Add all counts in :class:`ToCountMap` and evaluate with provided + parameter dict. + + :return: An :class:`int` containing the sum of all counts in the + :class:`ToCountMap` evaluated with the parameters provided. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_map(knl) + filtered_map = mem_map.filter_by(direction=['load'], + variable=['a','g']) + tot_loads_a_g = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + return self.sum().eval_with_dict(params) # }}} @@ -108,13 +368,150 @@ def stringify_stats_mapping(m): return result +class Op(object): + """An arithmetic operation. + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. + + .. attribute:: name + + A :class:`str` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + + """ + + def __init__(self, dtype=None, name=None): + self.name = name + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) + + def __eq__(self, other): + return isinstance(other, Op) and ( + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.name is None or other.name is None or + self.name == other.name)) + + def __hash__(self): + return hash(str(self)) + + def __str__(self): + if self.dtype is None: + dtype = 'None' + else: + dtype = str(self.dtype) + if self.name is None: + name = 'None' + else: + name = self.name + return "Op("+dtype+", "+name+")" + + +class MemAccess(object): + """A memory access. + + .. attribute:: mtype + + A :class:`str` that specifies the memory type accessed as **global** + or **local** + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. + + .. attribute:: stride + + An :class:`int` that specifies stride of the memory access. A stride of 0 + indicates a uniform access (i.e. all threads access the same item). + + .. attribute:: direction + + A :class:`str` that specifies the direction of memory access as + **load** or **store**. + + .. attribute:: variable + + A :class:`str` that specifies the variable name of the data + accessed. 
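+
+    Example usage (a sketch; the same constructor arguments appear in the
+    :func:`get_mem_access_map` examples below)::
+
+        acc = MemAccess(mtype='global', dtype=np.float32, stride=1,
+                        direction='load', variable='a')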
+ + """ + + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): + self.mtype = mtype + self.stride = stride + self.direction = direction + self.variable = variable + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) + + #TODO currently giving all lmem access stride=None + if (mtype == 'local') and (stride is not None): + raise NotImplementedError("MemAccess: stride must be None when " + "mtype is 'local'") + + #TODO currently giving all lmem access variable=None + if (mtype == 'local') and (variable is not None): + raise NotImplementedError("MemAccess: variable must be None when " + "mtype is 'local'") + + def __eq__(self, other): + return isinstance(other, MemAccess) and ( + (self.mtype is None or other.mtype is None or + self.mtype == other.mtype) and + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.stride is None or other.stride is None or + self.stride == other.stride) and + (self.direction is None or other.direction is None or + self.direction == other.direction) and + (self.variable is None or other.variable is None or + self.variable == other.variable)) + + def __hash__(self): + return hash(str(self)) + + def __str__(self): + if self.mtype is None: + mtype = 'None' + else: + mtype = self.mtype + if self.dtype is None: + dtype = 'None' + else: + dtype = str(self.dtype) + if self.stride is None: + stride = 'None' + else: + stride = str(self.stride) + if self.direction is None: + direction = 'None' + else: + direction = self.direction + if self.variable is None: + variable = 'None' + else: + variable = self.variable + return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \ + +variable+")" + + # {{{ ExpressionOpCounter class ExpressionOpCounter(CombineMapper): def __init__(self, knl): self.knl = knl - from loopy.expression import TypeInferenceMapper + from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) def combine(self, values): @@ -126,41 +523,33 @@ class ExpressionOpCounter(CombineMapper): map_tagged_variable = map_constant map_variable = map_constant - #def map_wildcard(self, expr): - # return 0,0 - - #def map_function_symbol(self, expr): - # return 0,0 - def map_call(self, expr): return ToCountMap( - {(self.type_inf(expr), 'func:'+str(expr.function)): 1} + {Op(dtype=self.type_inf(expr), + name='func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) - # def map_call_with_kwargs(self, expr): # implemented in CombineMapper - - def map_subscript(self, expr): # implemented in CombineMapper + def map_subscript(self, expr): return self.rec(expr.index) - # def map_lookup(self, expr): # implemented in CombineMapper - def map_sum(self, expr): assert expr.children return ToCountMap( - {(self.type_inf(expr), 'add'): len(expr.children)-1} + {Op(dtype=self.type_inf(expr), + name='add'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1}) + return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({(self.type_inf(expr), 'mul'): -1}) + ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({(self.type_inf(expr), 'div'): 1}) \ + return 
ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -168,54 +557,47 @@ class ExpressionOpCounter(CombineMapper): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap( - {(self.type_inf(expr), 'bw'): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): + len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or map_bitwise_and = map_bitwise_or - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - def map_logical_not(self, expr): - return self.rec(expr.child) - - def map_logical_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_logical_and = map_logical_or - def map_if(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_if_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_ifpos_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap( - {(self.type_inf(expr), 'maxmin'): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): + len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) map_max = map_min @@ -225,11 +607,13 @@ class ExpressionOpCounter(CombineMapper): "map_common_subexpression not implemented.") def map_substitution(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered substitution, " + raise NotImplementedError("ExpressionOpCounter encountered " + "substitution, " "map_substitution not implemented.") def map_derivative(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered derivative, " + raise NotImplementedError("ExpressionOpCounter encountered " + "derivative, " "map_derivative not implemented.") def map_slice(self, expr): @@ -239,13 +623,90 @@ class ExpressionOpCounter(CombineMapper): # }}} +# {{{ LocalSubscriptCounter + +class LocalSubscriptCounter(CombineMapper): + + def __init__(self, knl): + self.knl = knl + from loopy.type_inference import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, 
values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + map_tagged_variable = map_constant + map_variable = map_constant + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_subscript(self, expr): + sub_map = ToCountMap() + name = expr.aggregate.name # name of array + if name in self.knl.temporary_variables: + array = self.knl.temporary_variables[name] + if array.is_local: + sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 + return sub_map + self.rec(expr.index) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + def map_if(self, expr): + warn_with_kernel(self.knl, "summing_if_branches_lsubs", + "LocalSubscriptCounter counting LMEM accesses as sum " + "of if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_if_positive(self, expr): + warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", + "LocalSubscriptCounter counting LMEM accesses as sum " + "of if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_common_subexpression(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "common_subexpression, " + "map_common_subexpression not implemented.") + + def map_substitution(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "substitution, " + "map_substitution not implemented.") + + def map_derivative(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "derivative, " + "map_derivative not implemented.") + + def map_slice(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered slice, " + "map_slice not implemented.") + +# }}} + + # {{{ GlobalSubscriptCounter class GlobalSubscriptCounter(CombineMapper): def __init__(self, knl): self.knl = knl - from loopy.expression import TypeInferenceMapper + from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) def combine(self, values): @@ -278,33 +739,52 @@ class GlobalSubscriptCounter(CombineMapper): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, GroupIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - local_id0 = None + + # find min tag axis + import sys + min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: - # find local id0 tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - if tag.axis == 0: - local_id0 = iname - break # there will be only one local_id0 + if tag.axis < min_tag_axis: + min_tag_axis = tag.axis if not local_id_found: # count as uniform access - return ToCountMap( - {(self.type_inf(expr), 'uniform'): 1} - ) + self.rec(expr.index) + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name): 1} + ) + self.rec(expr.index) + + if min_tag_axis != 0: + warn_with_kernel(self.knl, "unknown_gmem_stride", + "GlobalSubscriptCounter: Memory access minimum " + "tag axis %d != 0, stride unknown, using " + "sys.maxsize." 
% (min_tag_axis)) + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name): 1} + ) + self.rec(expr.index) + + # get local_id associated with minimum tag axis + min_lid = None + for iname in my_inames: + tag = self.knl.iname_to_tag.get(iname) + if isinstance(tag, LocalIndexTag): + if tag.axis == min_tag_axis: + min_lid = iname + break # there will be only one min local_id - if local_id0 is None: - # only non-zero local id(s) found, assume non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + # found local_id associated with minimum tag axis - # check coefficient of local_id0 for each axis + total_stride = 0 + # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector + from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): @@ -312,36 +792,22 @@ class GlobalSubscriptCounter(CombineMapper): coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx)) # check if he contains the lid 0 guy try: - coeff_id0 = coeffs[Variable(local_id0)] + coeff_min_lid = coeffs[Variable(min_lid)] except KeyError: - # does not contain local_id0 + # does not contain min_lid continue - - if coeff_id0 != 1: - # non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # coefficient is 1, now determine if stride is 1 - from loopy.kernel.array import FixedStrideArrayDimTag + # found coefficient of min_lid + # now determine stride if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: continue - if stride != 1: - # non-consecutive - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + total_stride += stride*coeff_min_lid - # else, stride == 1, continue since another idx could contain id0 - - # loop finished without returning, stride==1 for every instance of local_id0 - return ToCountMap( - {(self.type_inf(expr), 'consecutive'): 1} - ) + self.rec(expr.index) + return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), + stride=total_stride, variable=name): 1} + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -351,48 +817,19 @@ class GlobalSubscriptCounter(CombineMapper): map_product = map_sum - def map_quotient(self, expr, *args): - return self.rec(expr.numerator) + self.rec(expr.denominator) - - map_floor_div = map_quotient - map_remainder = map_quotient - - def map_power(self, expr): - return self.rec(expr.base) + self.rec(expr.exponent) - - def map_left_shift(self, expr): - return self.rec(expr.shiftee)+self.rec(expr.shift) - - map_right_shift = map_left_shift - - def map_bitwise_not(self, expr): - return self.rec(expr.child) - - def map_bitwise_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_bitwise_xor = map_bitwise_or - map_bitwise_and = map_bitwise_or - - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - map_logical_not = map_bitwise_not - map_logical_or = map_bitwise_or - map_logical_and = map_logical_or - def map_if(self, expr): - warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + warn_with_kernel(self.knl, "summing_if_branches_gsubs", + "GlobalSubscriptCounter counting GMEM accesses as " + "sum of 
if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)

     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
-
-    map_min = map_bitwise_or
-    map_max = map_min
+        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)

     def map_common_subexpression(self, expr):
         raise NotImplementedError("GlobalSubscriptCounter encountered "
@@ -524,7 +961,8 @@ def count(kernel, set):

     # {{{ rebuild check domain

-    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space))
+    zero = isl.Aff.zero_on_domain(
+            isl.LocalSpace.from_space(bset.space))
     iname = isl.PwAff.from_aff(
         zero.set_coefficient_val(isl.dim_type.in_, i, 1))
     dmin_matched = dmin.insert_dims(
@@ -584,31 +1022,44 @@ def get_op_poly(knl, numpy_types=True):

     """Count the number of operations in a loopy kernel.

+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
+
+def get_op_map(knl, numpy_types=True):
+
+    """Count the number of operations in a loopy kernel.
+
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be
                     counted.

-    :return: A mapping of **{(** *type* **,** :class:`string` **)**
-             **:** :class:`islpy.PwQPolynomial` **}**.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.

-              - The *type* specifies the type of the data being
-                accessed. This can be a :class:`numpy.dtype` if
-                *numpy_types* is True, otherwise the internal
-                loopy type.
+    :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
+             :class:`islpy.PwQPolynomial` **}**.

-              - The string specifies the operation type as
-                *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+             - The :class:`Op` specifies the characteristics of the arithmetic
+               operation.

              - The :class:`islpy.PwQPolynomial` holds the number of operations
                of the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *parameter inames*).
+               :class:`loopy.LoopKernel` parameter *inames*).

     Example usage::

         # (first create loopy kernel and specify array data types)

-        poly = get_op_poly(knl)
+        op_map = get_op_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params)
+        f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params)

         # (now use these counts to predict performance)

@@ -618,88 +1069,128 @@
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)

-    op_poly = ToCountMap()
+    op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
         # check domain size:
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(
+                                insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_poly = op_poly + ops*count(knl, domain)
-    result = op_poly.dict
+        op_map = op_map + ops*count(knl, domain)

     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind), count)
-                for (dtype, kind), count in six.iteritems(result))
+        op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+                                 count)
+                      for op, count in six.iteritems(op_map.count_map))

-    return result
-# }}}
+    return op_map

-def sum_ops_to_dtypes(op_poly_dict):
-    result = {}
-    for (dtype, kind), v in op_poly_dict.items():
-        new_key = dtype
-        if new_key in result:
-            result[new_key] += v
-        else:
-            result[new_key] = v
+# TODO: test deprecated functions?
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.

-    return result
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])


 # {{{ get_gmem_access_poly

-def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
+def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.

-    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
-                    counted.
+    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.

-    :return: A mapping of **{(** *type* **,** :class:`string` **,**
-             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+    """
+    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
+                     "get_gmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+# }}}

-              - The *type* specifies the type of the data being
-                accessed. This can be a :class:`numpy.dtype` if
-                *numpy_types* is True, otherwise the internal
-                loopy type.

-              - The first string in the map key specifies the global memory
-                access type as
-                *consecutive*, *nonconsecutive*, or *uniform*.
+def get_mem_access_map(knl, numpy_types=True):
+    """Count the number of memory accesses in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+                    counted.

-              - The second string in the map key specifies the global memory
-                access type as a
-                *load*, or a *store*.
+    :parameter numpy_types: A :class:`bool` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:`loopy.LoopyType`.
- - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`MemAccess` specifies the characteristics of the + memory access. + + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: # (first create loopy kernel and specify array data types) - subscript_map = get_gmem_access_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - - f32_uncoalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_store = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'store') - ].eval_with_dict(params) + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='load', + variable='a') + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='store', + variable='a') + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='load', + variable='x') + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='store', + variable='x') + ].eval_with_dict(params) # (now use these counts to predict performance) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types class CacheHolder(object): @@ -712,7 +1203,8 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr if uniform: from loopy.kernel.data import LocalIndexTag insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + isinstance( + knl.iname_to_tag.get(iname), LocalIndexTag)] inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) @@ -721,82 +1213,82 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_poly = ToCountMap() - subscript_counter = GlobalSubscriptCounter(knl) + subs_map = ToCountMap() + subs_counter_g = GlobalSubscriptCounter(knl) + subs_counter_l = LocalSubscriptCounter(knl) + for insn in knl.instructions: - # count subscripts, distinguishing loads and stores - subs_expr = subscript_counter(insn.expression) - subs_expr = ToCountMap(dict( - (key + ("load",), val) - for key, val in six.iteritems(subs_expr.dict))) - subs_assignee = subscript_counter(insn.assignee) - subs_assignee = ToCountMap(dict( - (key + ("store",), val) - for key, val in six.iteritems(subs_assignee.dict))) + # count subscripts + subs_expr = subs_counter_g(insn.expression) \ + + subs_counter_l(insn.expression) + + # distinguish loads and stores + for key in subs_expr.count_map: + subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, direction='load', + variable=key.variable) + ] = subs_expr.pop(key) + + subs_assignee_g = subs_counter_g(insn.assignee) + for key in subs_assignee_g.count_map: + subs_assignee_g[MemAccess(mtype=key.mtype, 
dtype=key.dtype,
+                                          stride=key.stride,
+                                          direction='store',
+                                          variable=key.variable)
+                             ] = subs_assignee_g.pop(key)
+        # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.dict:
-            poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+        for key in subs_expr.count_map:
+            access_map = ToCountMap({key: subs_expr[key]})
+            if (key.mtype == 'global'
+                    and isinstance(key.stride, int) and key.stride == 0):
+                subs_map = subs_map \
+                            + access_map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-        for key in subs_assignee.dict:
-            poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1] == "uniform":
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+                subs_map = subs_map + access_map*get_insn_count(knl, insn_inames)
+            # currently not counting stride of local mem access
+
+        for key in subs_assignee_g.count_map:
+            access_map = ToCountMap({key: subs_assignee_g[key]})
+            if isinstance(key.stride, int) and key.stride == 0:
+                subs_map = subs_map \
+                            + access_map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-
-    result = subs_poly.dict
+                subs_map = subs_map + access_map*get_insn_count(knl, insn_inames)
+            # for now, don't count writes to local mem
 
     if numpy_types:
-        result = dict(
-                ((dtype.numpy_dtype, kind, direction), count)
-                for (dtype, kind, direction), count in six.iteritems(result))
-
-    return result
+        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+                                             dtype=mem_access.dtype.numpy_dtype,
+                                             stride=mem_access.stride,
+                                             direction=mem_access.direction,
+                                             variable=mem_access.variable),
+                                   count)
+                for mem_access, count in six.iteritems(subs_map.count_map))
 
-
-def get_DRAM_access_poly(knl):
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
-            DeprecationWarning, stacklevel=2)
-    return get_gmem_access_poly(knl)
-
-# }}}
+    return subs_map
 
-# {{{ sum_mem_access_to_bytes
+# {{{ get_synchronization_poly
 
-def sum_mem_access_to_bytes(m):
-    """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
 
-    **{(** :class:`string` **,** :class:`string` **)**
-    **:** :class:`islpy.PwQPolynomial` **}**
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
 
-    i.e., aggregate the transfer numbers for all types into a single byte count.
     """
-
-    result = {}
-    for (dtype, kind, direction), v in m.items():
-        new_key = (kind, direction)
-        bytes_transferred = int(dtype.itemsize) * v
-        if new_key in result:
-            result[new_key] += bytes_transferred
-        else:
-            result[new_key] = bytes_transferred
-
-    return result
+    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
 
 # }}}
 
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
+def get_synchronization_map(knl):
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
 
@@ -804,8 +1296,8 @@ def get_synchronization_poly(knl):
     :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
                     counted.
 
     :return: A dictionary mapping each type of synchronization event to a
-             :class:`islpy.PwQPolynomial` holding the number of such events
-             per thread.
+             :class:`islpy.PwQPolynomial` holding the number of events per
+             thread.
 
     Possible keys include ``barrier_local``, ``barrier_global``
     (if supported by the target) and ``kernel_launch``.
 
@@ -814,9 +1306,9 @@
 
         # (first create loopy kernel and specify array data types)
 
-        barrier_poly = get_barrier_poly(knl)
+        sync_map = get_synchronization_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = barrier_poly.eval_with_dict(params)
+        barrier_ct = sync_map['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
@@ -854,8 +1346,8 @@ def get_synchronization_poly(knl):
             iname_list.pop()
 
         elif isinstance(sched_item, Barrier):
-            result = result + ToCountMap(
-                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+            result = result + ToCountMap({"barrier_%s" % sched_item.kind:
+                                          get_count_poly(iname_list)})
 
         elif isinstance(sched_item, CallKernel):
             result = result + ToCountMap(
@@ -868,9 +1360,8 @@ def get_synchronization_poly(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    return result.dict
-
-# }}}
+    return result
 
 
 # {{{ gather_access_footprints
 
@@ -881,7 +1372,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
-    of each the array *var_name* are read/written (where *direction* is
-    either ``read`` or ``write``.
+    of the array *var_name* are read/written (where *direction* is
+    either ``read`` or ``write``).
 
-    :arg ignore_uncountable: If *True*, an error will be raised for
+    :arg ignore_uncountable: If *False*, an error will be raised for
         accesses on which the footprint cannot be determined (e.g.
         data-dependent or nonlinear indices)
     """
@@ -905,7 +1396,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
         insn_inames = kernel.insn_inames(insn)
         inames_domain = kernel.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(insn_inames,
+                                                   [dim_type.set]))
 
         afg = AccessFootprintGatherer(kernel, domain,
                 ignore_uncountable=ignore_uncountable)
@@ -947,7 +1439,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
     kernel = preprocess_kernel(kernel)
 
     result = {}
-    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
+    fp = gather_access_footprints(kernel,
+                                  ignore_uncountable=ignore_uncountable)
 
     for key, var_fp in fp.items():
         vname, direction = key
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 5b5b2477651c4026cfb4b0618481fbb8b3710728..430c651589939a1001432bd8db413cb5902b14a6 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -28,11 +28,10 @@ THE SOFTWARE.
import six from six.moves import range, zip, reduce, intern -from pytools import memoize, memoize_method, Record +from pytools import memoize, memoize_method, ImmutableRecord import pytools.lex -from pymbolic.primitives import ( - Leaf, Expression, Variable, CommonSubexpression) +import pymbolic.primitives as p from pymbolic.mapper import ( CombineMapper as CombineMapperBase, @@ -83,11 +82,11 @@ class IdentityMapperMixin(object): return expr def map_reduction(self, expr, *args): - mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] + mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] for iname, new_sym_iname in zip(expr.inames, mapped_inames): - if not isinstance(new_sym_iname, Variable): + if not isinstance(new_sym_iname, p.Variable): from loopy.diagnostic import LoopyError raise LoopyError("%s did not map iname '%s' to a variable" % (type(self).__name__, iname)) @@ -253,7 +252,7 @@ class DependencyMapper(DependencyMapperBase): def map_reduction(self, expr): return (self.rec(expr.expr) - - set(Variable(iname) for iname in expr.inames)) + - set(p.Variable(iname) for iname in expr.inames)) def map_tagged_variable(self, expr): return set([expr]) @@ -303,7 +302,7 @@ class SubstitutionRuleExpander(IdentityMapper): # {{{ loopy-specific primitives -class Literal(Leaf): +class Literal(p.Leaf): """A literal to be used during code generation.""" def __init__(self, s): @@ -320,7 +319,7 @@ class Literal(Leaf): mapper_method = "map_literal" -class ArrayLiteral(Leaf): +class ArrayLiteral(p.Leaf): "An array literal." # Currently only used after loopy -> C expression translation. @@ -339,7 +338,7 @@ class ArrayLiteral(Leaf): mapper_method = "map_array_literal" -class HardwareAxisIndex(Leaf): +class HardwareAxisIndex(p.Leaf): def __init__(self, axis): self.axis = axis @@ -360,7 +359,7 @@ class LocalHardwareAxisIndex(HardwareAxisIndex): mapper_method = "map_local_hw_index" -class FunctionIdentifier(Leaf): +class FunctionIdentifier(p.Leaf): """A base class for symbols representing functions.""" init_arg_names = () @@ -371,13 +370,13 @@ class FunctionIdentifier(Leaf): mapper_method = intern("map_loopy_function_identifier") -class TypedCSE(CommonSubexpression): +class TypedCSE(p.CommonSubexpression): """A :class:`pymbolic.primitives.CommonSubexpression` annotated with a :class:`numpy.dtype`. """ def __init__(self, child, prefix=None, dtype=None): - CommonSubexpression.__init__(self, child, prefix) + super(TypedCSE, self).__init__(child, prefix) self.dtype = dtype def __getinitargs__(self): @@ -387,7 +386,7 @@ class TypedCSE(CommonSubexpression): return dict(dtype=self.dtype) -class TypeAnnotation(Expression): +class TypeAnnotation(p.Expression): def __init__(self, type, child): super(TypeAnnotation, self).__init__() self.type = type @@ -399,7 +398,7 @@ class TypeAnnotation(Expression): mapper_method = intern("map_type_annotation") -class TaggedVariable(Variable): +class TaggedVariable(p.Variable): """This is an identifier with a tag, such as 'matrix$one', where 'one' identifies this specific use of the identifier. 
This mechanism may then be used to address these uses--such as by prefetching only @@ -409,7 +408,7 @@ class TaggedVariable(Variable): init_arg_names = ("name", "tag") def __init__(self, name, tag): - Variable.__init__(self, name) + super(TaggedVariable, self).__init__(name) self.tag = tag def __getinitargs__(self): @@ -421,7 +420,7 @@ class TaggedVariable(Variable): mapper_method = intern("map_tagged_variable") -class Reduction(Expression): +class Reduction(p.Expression): """Represents a reduction operation on :attr:`expr` across :attr:`inames`. @@ -451,13 +450,13 @@ class Reduction(Expression): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) - elif isinstance(inames, Variable): + elif isinstance(inames, p.Variable): inames = (inames,) assert isinstance(inames, tuple) def strip_var(iname): - if isinstance(iname, Variable): + if isinstance(iname, p.Variable): iname = iname.name assert isinstance(iname, str) @@ -501,7 +500,7 @@ class Reduction(Expression): mapper_method = intern("map_reduction") -class LinearSubscript(Expression): +class LinearSubscript(p.Expression): """Represents a linear index into a multi-dimensional array, completely ignoring any multi-dimensional layout. """ @@ -521,7 +520,7 @@ class LinearSubscript(Expression): mapper_method = intern("map_linear_subscript") -class RuleArgument(Expression): +class RuleArgument(p.Expression): """Represents a (numbered) argument of a :class:`loopy.SubstitutionRule`. Only used internally in the rule-aware mappers to match subst rules independently of argument names. @@ -554,13 +553,13 @@ def get_dependencies(expr): def parse_tagged_name(expr): if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, Variable): + elif isinstance(expr, p.Variable): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) -class ExpansionState(Record): +class ExpansionState(ImmutableRecord): """ .. attribute:: kernel .. 
attribute:: instruction @@ -590,7 +589,7 @@ class SubstitutionRuleRenamer(IdentityMapper): self.renames = renames def map_call(self, expr): - if not isinstance(expr.function, Variable): + if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr) name, tag = parse_tagged_name(expr.function) @@ -600,7 +599,7 @@ class SubstitutionRuleRenamer(IdentityMapper): return IdentityMapper.map_call(self, expr) if tag is None: - sym = Variable(new_name) + sym = p.Variable(new_name) else: sym = TaggedVariable(new_name, tag) @@ -614,7 +613,7 @@ class SubstitutionRuleRenamer(IdentityMapper): return IdentityMapper.map_variable(self, expr) if tag is None: - return Variable(new_name) + return p.Variable(new_name) else: return TaggedVariable(new_name, tag) @@ -760,7 +759,7 @@ class RuleAwareIdentityMapper(IdentityMapper): return self.map_substitution(name, tag, (), expn_state) def map_call(self, expr, expn_state): - if not isinstance(expr.function, Variable): + if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr, expn_state) name, tag = parse_tagged_name(expr.function) @@ -803,7 +802,7 @@ class RuleAwareIdentityMapper(IdentityMapper): name, rule.arguments, result) if tag is None: - sym = Variable(new_name) + sym = p.Variable(new_name) else: sym = TaggedVariable(new_name, tag) @@ -920,7 +919,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): def _parse_reduction(self, operation, inames, red_expr, allow_simultaneous=False): - if isinstance(inames, Variable): + if isinstance(inames, p.Variable): inames = (inames,) if not isinstance(inames, (tuple)): @@ -929,7 +928,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames = [] for iname in inames: - if not isinstance(iname, Variable): + if not isinstance(iname, p.Variable): raise TypeError("iname argument to reduce() must be a symbol " "or a tuple or a tuple of symbols") @@ -941,22 +940,20 @@ class FunctionToPrimitiveMapper(IdentityMapper): def map_call(self, expr): from loopy.library.reduction import parse_reduction_op - from pymbolic.primitives import Variable - if not isinstance(expr.function, Variable): + if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr) name = expr.function.name if name == "cse": - from pymbolic.primitives import CommonSubexpression if len(expr.parameters) in [1, 2]: if len(expr.parameters) == 2: - if not isinstance(expr.parameters[1], Variable): + if not isinstance(expr.parameters[1], p.Variable): raise TypeError("second argument to cse() must be a symbol") tag = expr.parameters[1].name else: tag = None - return CommonSubexpression( + return p.CommonSubexpression( self.rec(expr.parameters[0]), tag) else: raise TypeError("cse takes two arguments") @@ -965,7 +962,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): if len(expr.parameters) == 3: operation, inames, red_expr = expr.parameters - if not isinstance(operation, Variable): + if not isinstance(operation, p.Variable): raise TypeError("operation argument to reduce() " "must be a symbol") @@ -1098,8 +1095,7 @@ class ArrayAccessFinder(CombineMapper): return set() def map_subscript(self, expr): - from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) + assert isinstance(expr.aggregate, p.Variable) if self.tgt_vector_name is None \ or expr.aggregate.name == self.tgt_vector_name: @@ -1142,12 +1138,17 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return pw_aff pieces = pw_aff.get_pieces() + last_expr = aff_to_expr(pieces[-1][1]) - if 
len(pieces) != 1: - raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances") + pairs = [(set_to_cond_expr(constr_set), aff_to_expr(aff)) + for constr_set, aff in pieces[:-1]] - (set, aff), = pieces - return aff_to_expr(aff) + from pymbolic.primitives import If + expr = last_expr + for condition, then_expr in reversed(pairs): + expr = If(condition, then_expr, expr) + + return expr # }}} @@ -1255,7 +1256,7 @@ def simplify_using_aff(kernel, expr): # }}} -# {{{ expression <-> constraint conversion +# {{{ expression/set <-> constraint conversion def eq_constraint_from_expr(space, expr): return isl.Constraint.equality_from_aff(aff_from_expr(space, expr)) @@ -1265,7 +1266,7 @@ def ineq_constraint_from_expr(space, expr): return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr)) -def constraint_to_expr(cns): +def constraint_to_cond_expr(cns): # Looks like this is ok after all--get_aff() performs some magic. # Not entirely sure though... FIXME # @@ -1284,6 +1285,39 @@ def constraint_to_expr(cns): # }}} +# {{{ set_to_cond_expr + +def basic_set_to_cond_expr(isl_basicset): + constrs = [] + for constr in isl_basicset.get_constraints(): + constrs.append(constraint_to_cond_expr(constr)) + + if len(constrs) == 0: + raise ValueError("may not be called on universe") + elif len(constrs) == 1: + constr, = constrs + return constr + else: + return p.LogicalAnd(tuple(constrs)) + + +def set_to_cond_expr(isl_set): + conjs = [] + for isl_basicset in isl_set.get_basic_sets(): + conjs.append(basic_set_to_cond_expr(isl_basicset)) + + if len(conjs) == 0: + raise ValueError("may not be called on universe") + elif len(conjs) == 1: + conj, = conjs + return conj + else: + return p.LogicalOr(tuple(conjs)) + + +# }}} + + # {{{ Reduction callback mapper class ReductionCallbackMapper(IdentityMapper): @@ -1318,10 +1352,9 @@ class IndexVariableFinder(CombineMapper): def map_subscript(self, expr): idx_vars = DependencyMapper()(expr.index) - from pymbolic.primitives import Variable result = set() for idx_var in idx_vars: - if isinstance(idx_var, Variable): + if isinstance(idx_var, p.Variable): result.add(idx_var.name) else: raise RuntimeError("index variable not understood: %s" % idx_var) @@ -1432,8 +1465,7 @@ class AccessRangeMapper(WalkMapper): domain = self.kernel.get_inames_domain(inames) WalkMapper.map_subscript(self, expr, inames) - from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) + assert isinstance(expr.aggregate, p.Variable) if expr.aggregate.name != self.arg_name: return @@ -1476,8 +1508,7 @@ def is_expression_equal(a, b): if a == b: return True - from pymbolic.primitives import Expression - if isinstance(a, Expression) or isinstance(b, Expression): + if isinstance(a, p.Expression) or isinstance(b, p.Expression): if a is None or b is None: return False diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 409b9badb639c500e70404e781036b2e39bf333f..5d5743bae322fc59c989cafd85122c8ca619c422 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -216,6 +216,9 @@ class ASTBuilderBase(object): def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): raise NotImplementedError() + def emit_declaration_scope(self, codegen_state, inner): + raise NotImplementedError() + def emit_blank_line(self): raise NotImplementedError() @@ -267,6 +270,10 @@ class DummyHostASTBuilder(ASTBuilderBase): def ast_block_class(self): return _DummyASTBlock + @property + def ast_block_scope_class(self): + return _DummyASTBlock + def 
emit_assignment(self, codegen_state, insn): return None diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 50ae5856bce6eab8cf874242558a6c34f63a2d79..be83ec90c4720f10876e1a5e47a43c429fc40aeb 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1,4 +1,4 @@ -"""OpenCL target independent of PyOpenCL.""" +"""Plain C target and base for other C-family languages.""" from __future__ import division, absolute_import @@ -29,10 +29,11 @@ import six import numpy as np # noqa from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError -from cgen import Pointer +from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper +import pymbolic.primitives as p from pytools import memoize_method @@ -131,6 +132,16 @@ class POD(Declarator): mapper_method = "map_loopy_pod" + +class ScopingBlock(Block): + """A block that is mandatory for scoping and may not be simplified away + by :func:`loopy.codegen.results.merge_codegen_results`. + """ + + +class FunctionDeclarationWrapper(NestedDeclarator): + mapper_method = "map_function_decl_wrapper" + # }}} @@ -202,6 +213,10 @@ class CASTIdentityMapper(CASTIdentityMapperBase): def map_loopy_pod(self, node, *args, **kwargs): return type(node)(node.ast_builder, node.dtype, node.name) + def map_function_decl_wrapper(self, node, *args, **kwargs): + return FunctionDeclarationWrapper( + self.rec(node.subdecl, *args, **kwargs)) + class SubscriptSubsetCounter(IdentityMapper): def __init__(self, subset_counters): @@ -333,7 +348,7 @@ class CASTBuilder(ASTBuilderBase): index_dtype=kernel.index_dtype) decl = self.wrap_global_constant( self.get_temporary_decl( - kernel, schedule_index, tv, + codegen_state, schedule_index, tv, decl_info)) if tv.initializer is not None: @@ -377,10 +392,11 @@ class CASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" - return FunctionDeclaration( - Value("void", name), - [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info]) + return FunctionDeclarationWrapper( + FunctionDeclaration( + Value("void", name), + [self.idi_to_cgen_declarator(codegen_state.kernel, idi) + for idi in codegen_state.implemented_data_info])) def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope @@ -409,7 +425,8 @@ class CASTBuilder(ASTBuilderBase): if tv.scope != temp_var_scope.GLOBAL: decl = self.wrap_temporary_decl( self.get_temporary_decl( - kernel, schedule_index, tv, idi), tv.scope) + codegen_state, schedule_index, tv, idi), + tv.scope) if tv.initializer is not None: decl = Initializer(decl, generate_array_literal( @@ -467,12 +484,21 @@ class CASTBuilder(ASTBuilderBase): idi.dtype.itemsize * product(si for si in idi.shape)) + ecm = self.get_expression_to_code_mapper(codegen_state) + for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( bs_var_decl, single_valued(base_storage_to_scope[bs_name])) - bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) + + # FIXME: Could try to use isl knowledge to simplify max. 
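+            # If every size is a known integer, the max can be taken here
+            # directly; otherwise a symbolic Max is emitted and left to the
+            # expression-to-code mapper below.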
+ if all(isinstance(bs, int) for bs in bs_sizes): + bs_size_max = max(bs_sizes) + else: + bs_size_max = p.Max(tuple(bs_sizes)) + + bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max)) alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) @@ -493,6 +519,10 @@ class CASTBuilder(ASTBuilderBase): from cgen import Block return Block + @property + def ast_block_scope_class(self): + return ScopingBlock + # }}} # {{{ code generation guts @@ -509,7 +539,7 @@ class CASTBuilder(ASTBuilderBase): from loopy.target.c.codegen.expression import CExpressionToCodeMapper return CExpressionToCodeMapper() - def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if temp_var.read_only: @@ -518,8 +548,10 @@ class CASTBuilder(ASTBuilderBase): if decl_info.shape: from cgen import ArrayOf + ecm = self.get_expression_to_code_mapper(codegen_state) temp_var_decl = ArrayOf(temp_var_decl, - " * ".join(str(s) for s in decl_info.shape)) + ecm(p.flattened_product(decl_info.shape), + prec=PREC_NONE, type_context="i")) return temp_var_decl @@ -690,11 +722,9 @@ class CASTBuilder(ASTBuilderBase): CExpression(self.get_c_expression_to_code_mapper(), result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE @@ -703,12 +733,12 @@ class CASTBuilder(ASTBuilderBase): return For( InlineInitializer( POD(self, iname_dtype, iname), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( Comparison( var(iname), "<=", - aff_to_expr(static_ubound)), + ubound), PREC_NONE, "i"), "++%s" % iname, inner) @@ -743,4 +773,45 @@ class CASTBuilder(ASTBuilderBase): return node +# {{{ header generation + +class CFunctionDeclExtractor(CASTIdentityMapper): + def __init__(self): + self.decls = [] + + def map_expression(self, expr): + return expr + + def map_function_decl_wrapper(self, node): + self.decls.append(node.subdecl) + return super(CFunctionDeclExtractor, self)\ + .map_function_decl_wrapper(node) + + +def generate_header(kernel, codegen_result=None): + """ + :arg kernel: a :class:`loopy.LoopKernel` + :arg codegen_result: an instance of :class:`loopy.CodeGenerationResult` + :returns: a list of AST nodes (which may have :func:`str` + called on them to produce a string) representing + function declarations for the generated device + functions. 
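+
+    Example usage (a sketch, not from the test suite; assumes *kernel* is
+    fully typed and targets a C-family language)::
+
+        decls = generate_header(kernel)
+        header_src = "\n".join(str(decl) for decl in decls)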
+    """
+
+    if not isinstance(kernel.target, CTarget):
+        raise LoopyError(
+                'Header generation for non-C-based languages is not implemented')
+
+    if codegen_result is None:
+        from loopy.codegen import generate_code_v2
+        codegen_result = generate_code_v2(kernel)
+
+    fde = CFunctionDeclExtractor()
+    for dev_prg in codegen_result.device_programs:
+        fde(dev_prg.ast)
+
+    return fde.decls
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 91c42c542f67412b749ce739c3fda56b3ead4d7f..68cc32e56be077c7e45d11b9e2aade86b04494cc 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -36,7 +36,8 @@ import pymbolic.primitives as p
 
 from pymbolic import var
 
-from loopy.expression import dtype_to_type_context, TypeInferenceMapper
+from loopy.expression import dtype_to_type_context
+from loopy.type_inference import TypeInferenceMapper
 
 from loopy.diagnostic import LoopyError, LoopyWarning
 from loopy.tools import is_integer
@@ -104,7 +105,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 self.infer_type(expr), needed_dtype,
                 RecursiveMapper.rec(self, expr, type_context))
 
-    def __call__(self, expr, prec, type_context=None, needed_dtype=None):
+    def __call__(self, expr, prec=None, type_context=None, needed_dtype=None):
+        if prec is None:
+            prec = PREC_NONE
+
         assert prec == PREC_NONE
         from loopy.target.c import CExpression
         return CExpression(
@@ -144,6 +148,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             from loopy.kernel.data import ValueArg
             if isinstance(arg, ValueArg) and self.fortran_abi:
                 postproc = lambda x: x[0]  # noqa
+        elif expr.name in self.kernel.temporary_variables:
+            temporary = self.kernel.temporary_variables[expr.name]
+            if temporary.base_storage:
+                postproc = lambda x: x[0]  # noqa
 
         result = self.kernel.mangle_symbol(self.codegen_state.ast_builder,
                 expr.name)
         if result is not None:
@@ -212,12 +220,15 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)):
             if len(access_info.subscripts) == 0:
-                if isinstance(ary, GlobalArg) or isinstance(ary, ConstantArg):
+                if (
+                        (isinstance(ary, (ConstantArg, GlobalArg)) or
+                         (isinstance(ary, TemporaryVariable)
+                             and ary.base_storage))):
                     # unsubscripted global args are pointers
                     result = var(access_info.array_name)[0]
 
                 else:
                     # unsubscripted temp vars are scalars
+                    # (unless they use base_storage)
                     result = var(access_info.array_name)
 
             else:
@@ -675,6 +686,10 @@ class CExpressionToCodeMapper(RecursiveMapper):
         return f % tuple(
                 self.rec(i, prec) for i in iterable)
 
+    def join(self, joiner, iterable):
+        f = joiner.join("%s" for i in iterable)
+        return f % tuple(iterable)
+
     # }}}
 
     def map_constant(self, expr, prec):
@@ -769,9 +784,19 @@ class CExpressionToCodeMapper(RecursiveMapper):
                 enclosing_prec, PREC_LOGICAL_AND)
 
     def map_logical_or(self, expr, enclosing_prec):
-        return self.parenthesize_if_needed(
-                self.join_rec(" || ", expr.children, PREC_LOGICAL_OR),
-                enclosing_prec, PREC_LOGICAL_OR)
+        mapped_children = []
+        from pymbolic.primitives import LogicalAnd
+        for child in expr.children:
+            mapped_child = self.rec(child, PREC_LOGICAL_OR)
+            # clang warns on unparenthesized && within ||
+            if isinstance(child, LogicalAnd):
+                mapped_child = "(%s)" % mapped_child
+            mapped_children.append(mapped_child)
+
+        result = self.join(" || ", mapped_children)
+        if enclosing_prec > PREC_LOGICAL_OR:
+            result = "(%s)" % result
+        return result
 
     def map_sum(self, expr, enclosing_prec):
         from
pymbolic.mapper.stringifier import PREC_SUM diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d31718f15ed563bba0b602e6017536b72b6deed0..2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -99,6 +99,7 @@ def _create_vector_types(): vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + _create_vector_types() @@ -147,7 +148,7 @@ class ExpressionToCudaCExpressionMapper(ExpressionToCExpressionMapper): def _get_index_ctype(kernel): if kernel.index_dtype.numpy_dtype == np.int32: return "int32_t" - elif kernel.index_dtype.numpy_dtype == np.int32: + elif kernel.index_dtype.numpy_dtype == np.int64: return "int64_t" else: raise LoopyError("unexpected index type") @@ -232,6 +233,10 @@ class CUDACASTBuilder(CASTBuilder): fdecl = super(CUDACASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) + from loopy.target.c import FunctionDeclarationWrapper + assert isinstance(fdecl, FunctionDeclarationWrapper) + fdecl = fdecl.subdecl + from cgen.cuda import CudaGlobal, CudaLaunchBounds fdecl = CudaGlobal(fdecl) @@ -254,7 +259,7 @@ class CUDACASTBuilder(CASTBuilder): fdecl = CudaLaunchBounds(nthreads, fdecl) - return fdecl + return FunctionDeclarationWrapper(fdecl) def generate_code(self, kernel, codegen_state, impl_arg_info): code, implemented_domains = ( @@ -313,7 +318,7 @@ class CUDACASTBuilder(CASTBuilder): % scope) def wrap_global_constant(self, decl): - from cgen.opencl import CudaConstant + from cgen.cuda import CudaConstant return CudaConstant(decl) def get_global_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 536a186e7ef62bd2644ba81a11cc61a2079ac2be..80a69bd00c99258b709ea18b2a716c339b888b02 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -32,6 +32,7 @@ from loopy.diagnostic import LoopyError from loopy.symbolic import Literal from pymbolic import var import pymbolic.primitives as p +from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method @@ -236,16 +237,19 @@ class ISPCASTBuilder(CASTBuilder): arg_names, arg_decls = self._arg_names_and_decls(codegen_state) if codegen_state.is_generating_device_code: - return ISPCTask( + result = ISPCTask( FunctionDeclaration( Value("void", name), arg_decls)) else: - return ISPCExport( + result = ISPCExport( FunctionDeclaration( Value("void", name), arg_decls)) + from loopy.target.c import FunctionDeclarationWrapper + return FunctionDeclarationWrapper(result) + # }}} def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): @@ -295,7 +299,7 @@ class ISPCASTBuilder(CASTBuilder): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, knl, sched_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -306,13 +310,16 @@ class ISPCASTBuilder(CASTBuilder): # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? 
(See also # above in expr to code mapper) - _, lsize = knl.get_grid_size_upper_bounds_as_exprs() + _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() shape = lsize + shape if shape: from cgen import ArrayOf - temp_var_decl = ArrayOf(temp_var_decl, - " * ".join(str(s) for s in shape)) + ecm = self.get_expression_to_code_mapper(codegen_state) + temp_var_decl = ArrayOf( + temp_var_decl, + ecm(p.flattened_product(shape), + prec=PREC_NONE, type_context="i")) return temp_var_decl @@ -465,23 +472,22 @@ class ISPCASTBuilder(CASTBuilder): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, Initializer + from cgen import For, InlineInitializer from cgen.ispc import ISPCUniform return For( - Initializer( + InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( - p.Comparison(var(iname), "<=", aff_to_expr(static_ubound)), + p.Comparison(var(iname), "<=", ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index f0436099c6127e6426b03df2c48342b6ee99c67f..31cf7c6b648ebf370a17d8beb2538b9748ddb30a 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -126,6 +126,7 @@ def _create_vector_types(): vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + _create_vector_types() @@ -400,6 +401,10 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) + from loopy.target.c import FunctionDeclarationWrapper + assert isinstance(fdecl, FunctionDeclarationWrapper) + fdecl = fdecl.subdecl + from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) @@ -415,7 +420,7 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl) - return fdecl + return FunctionDeclarationWrapper(fdecl) def generate_top_of_body(self, codegen_state): from loopy.kernel.data import ImageArg diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 540cad00036de046484357826781353a927d7497..61e8e4f396126e17123c1bf775dbfeee2fe21f0d 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -25,7 +25,7 @@ THE SOFTWARE. 
import six from six.moves import range, zip -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import ParameterFinderWarning from pytools.py_codegen import ( Indentation, PythonFunctionGenerator) @@ -610,7 +610,7 @@ def generate_invoker(kernel, codegen_result): # {{{ kernel executor -class _CLKernelInfo(Record): +class _CLKernelInfo(ImmutableRecord): pass diff --git a/loopy/target/python.py b/loopy/target/python.py index 591161d818bf6691a0412b3a00d624f8b02dde5b..09a86665b7d949d7bf35b910cd2a6fd66109c1ec 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -29,10 +29,11 @@ import numpy as np from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper -from loopy.expression import TypeInferenceMapper +from loopy.type_inference import TypeInferenceMapper from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase +from genpy import Suite # {{{ expression to code @@ -129,6 +130,30 @@ class ExpressionToPythonMapper(StringifyMapper): def map_local_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have local hw axes") + def map_if(self, expr, enclosing_prec): + # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the + # operator precedence hierarchy (right above "or"). + from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 + + return self.parenthesize_if_needed( + "{then} if {cond} else {else_}".format( + then=self.rec(expr.then, PREC_IFTHENELSE), + cond=self.rec(expr.condition, PREC_IFTHENELSE), + else_=self.rec(expr.else_, PREC_IFTHENELSE)), + enclosing_prec, PREC_NONE) + +# }}} + + +# {{{ genpy extensions + +class Collection(Suite): + def generate(self): + for item in self.contents: + for item_line in item.generate(): + yield item_line + # }}} @@ -219,15 +244,19 @@ class PythonASTBuilderBase(ASTBuilderBase): @property def ast_block_class(self): - from genpy import Suite return Suite + @property + def ast_block_scope_class(self): + # Once a new version of genpy is released, switch to this: + # from genpy import Collection + # and delete the implementation above. + return Collection + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic.mapper.stringifier import PREC_NONE from genpy import For @@ -235,8 +264,8 @@ class PythonASTBuilderBase(ASTBuilderBase): (iname,), "range(%s, %s + 1)" % ( - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), - ecm(aff_to_expr(static_ubound), PREC_NONE, "i"), + ecm(lbound, PREC_NONE, "i"), + ecm(ubound, PREC_NONE, "i"), ), inner) diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py index 570b5efffb29e0ebb56b99444db19766127be596..f42fd3c8d2943bb37b75e9ef0003b88985950926 100644 --- a/loopy/transform/__init__.py +++ b/loopy/transform/__init__.py @@ -21,6 +21,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - - - diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index 3c7bfed43b9bd02a4be3d71b2317cee94da75b4b..f4e6526a7b083f0b38dda1209b607aa38a62b68e 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -28,11 +28,11 @@ from islpy import dim_type from loopy.symbolic import (get_dependencies, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method from pymbolic import var -class AccessDescriptor(Record): +class AccessDescriptor(ImmutableRecord): """ .. attribute:: identifier diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b2c86c084f0c56ebfb6ec8ebe4f6f5e65c5fd37d..92cff7a507d672a3acc51a8abed572a04cb7e86a 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -500,7 +500,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, store_instruction = Assignment( id=kernel.make_unique_instruction_id(based_on="store_"+var_name), depends_on=frozenset(aar.modified_insn_ids), - no_sync_with=frozenset([init_insn_id]), + no_sync_with=frozenset([(init_insn_id, "any")]), assignee=store_target, expression=store_expression, within_inames=( diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index bf6a6e1b98e6abbc4b483383f4bb9cf8b06bed1a..c35b5064365293ac78cdd01af537c9d28bd67193 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -79,8 +79,8 @@ __doc__ = """ def set_loop_priority(kernel, loop_priority): from warnings import warn - warn("set_loop_priority is deprecated. Use prioritize_loops instead." - "Attention: A call to set_loop_priority will overwrite any previously" + warn("set_loop_priority is deprecated. Use prioritize_loops instead. " + "Attention: A call to set_loop_priority will overwrite any previously " "set priorities!", DeprecationWarning, stacklevel=2) if isinstance(loop_priority, str): diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 5566077128a3f2514f2f86d04748935e7b3ff18b..7c9c9688604179dce2aa7dcd6954d76a0df32cc7 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -76,21 +76,34 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency -def add_dependency(kernel, insn_match, dependency): +def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. - *insn_match* may be any instruction id match understood by + *insn_match* and *depends_on* may be any instruction id match understood by :func:`loopy.match.parse_match`. + + .. versionchanged:: 2016.3 + + Third argument renamed to *depends_on* for clarity, allowed to + be not just ID but also match expression. 
""" - if dependency not in kernel.id_to_insn: - raise LoopyError("cannot add dependency on non-existent instruction ID '%s'" - % dependency) + if isinstance(depends_on, str) and depends_on in kernel.id_to_insn: + added_deps = frozenset([depends_on]) + else: + added_deps = frozenset( + dep.id for dep in find_instructions(kernel, depends_on)) + + if not added_deps: + raise LoopyError("no instructions found matching '%s' " + "(to add as dependencies)" % depends_on) + + matched = [False] def add_dep(insn): new_deps = insn.depends_on - added_deps = frozenset([dependency]) + matched[0] = True if new_deps is None: new_deps = added_deps else: @@ -98,7 +111,13 @@ def add_dependency(kernel, insn_match, dependency): return insn.copy(depends_on=new_deps) - return map_instructions(kernel, insn_match, add_dep) + result = map_instructions(kernel, insn_match, add_dep) + + if not matched[0]: + raise LoopyError("no instructions found matching '%s' " + "(to which dependencies would be added)" % insn_match) + + return result # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 5ab9dfab3c8ac0669c3e7eaf4091bb3ab4b0e2a2..a19e06ecdf7c9966501ebb9600ea4e01614363f4 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -799,7 +799,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if temporary_scope == temp_var_scope.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( - based_on=c_subst_name+"_b") + based_on=c_subst_name+"_barrier") from loopy.kernel.instruction import BarrierInstruction barrier_insn = BarrierInstruction( id=barrier_insn_id, diff --git a/loopy/transform/save.py b/loopy/transform/save.py new file mode 100644 index 0000000000000000000000000000000000000000..8706bc4da70b94ad678f07158e0a0f648fdd0030 --- /dev/null +++ b/loopy/transform/save.py @@ -0,0 +1,587 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from loopy.diagnostic import LoopyError +import loopy as lp + +from loopy.kernel.data import auto +from pytools import memoize_method, Record +from loopy.schedule import ( + EnterLoop, LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier) + +from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. 
autofunction:: save_and_reload_temporaries +""" + + +# {{{ liveness analysis + +class LivenessResult(dict): + + class InstructionResult(Record): + __slots__ = ["live_in", "live_out"] + + @classmethod + def make_empty(cls, nscheditems): + return cls((idx, cls.InstructionResult(live_in=set(), live_out=set())) + for idx in range(nscheditems)) + + +class LivenessAnalysis(object): + + def __init__(self, kernel): + self.kernel = kernel + self.schedule = self.kernel.schedule + + @memoize_method + def get_successor_relation(self): + successors = {} + block_bounds = get_block_boundaries(self.kernel.schedule) + + for idx, (item, next_item) in enumerate(zip( + reversed(self.schedule), + reversed(self.schedule + [None]))): + sched_idx = len(self.schedule) - idx - 1 + + # Look at next_item + if next_item is None: + after = set() + elif isinstance(next_item, EnterLoop): + # Account for empty loop + loop_end = block_bounds[sched_idx + 1] + after = successors[loop_end] | set([sched_idx + 1]) + elif isinstance(next_item, (LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + after = set([sched_idx + 1]) + else: + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(next_item).__name__)) + + # Look at item + if isinstance(item, LeaveLoop): + # Account for loop + loop_begin = block_bounds[sched_idx] + after |= set([loop_begin]) + elif not isinstance(item, (EnterLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(item).__name__)) + + successors[sched_idx] = after + + return successors + + def get_gen_and_kill_sets(self): + gen = dict((idx, set()) for idx in range(len(self.schedule))) + kill = dict((idx, set()) for idx in range(len(self.schedule))) + + for sched_idx, sched_item in enumerate(self.schedule): + if not isinstance(sched_item, RunInstruction): + continue + insn = self.kernel.id_to_insn[sched_item.insn_id] + for var in insn.assignee_var_names(): + if var not in self.kernel.temporary_variables: + continue + if not insn.predicates: + # Fully kills the liveness only when unconditional. + kill[sched_idx].add(var) + if len(self.kernel.temporary_variables[var].shape) > 0: + # For an array variable, all definitions generate a use as + # well, because the write could be a partial write, + # necessitating a reload of whatever is not written. + # + # We don't currently check if the write is a partial write + # or a full write. Instead, we analyze the access + # footprint later on to determine how much to reload/save. 
+                    gen[sched_idx].add(var)
+            for var in insn.read_dependency_names():
+                if var not in self.kernel.temporary_variables:
+                    continue
+                gen[sched_idx].add(var)
+
+        return gen, kill
+
+    @memoize_method
+    def liveness(self):
+        logger.info("running liveness analysis")
+        successors = self.get_successor_relation()
+        gen, kill = self.get_gen_and_kill_sets()
+
+        # Fixed point iteration for liveness analysis
+        lr = LivenessResult.make_empty(len(self.schedule))
+
+        prev_lr = None
+
+        while prev_lr != lr:
+            from copy import deepcopy
+            prev_lr = deepcopy(lr)
+            for idx in range(len(self.schedule) - 1, -1, -1):
+                for succ in successors[idx]:
+                    lr[idx].live_out.update(lr[succ].live_in)
+                lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx])
+
+        logger.info("done running liveness analysis")
+
+        return lr
+
+    def print_liveness(self):
+        print(75 * "-")
+        print("LIVE IN:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_in))))
+        print(75 * "-")
+        print("LIVE OUT:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_out))))
+        print(75 * "-")
+
+    def __getitem__(self, sched_idx):
+        """
+        :arg sched_idx: An index into the kernel's schedule
+
+        :returns: The :class:`LivenessResult.InstructionResult` holding the
+            live-in and live-out sets for the schedule item at *sched_idx*
+        """
+        return self.liveness()[sched_idx]
+
+# }}}
+
+
+# {{{ save and reload implementation
+
+class TemporarySaver(object):
+
+    class PromotedTemporary(Record):
+        """
+        .. attribute:: name
+
+            The name of the new temporary.
+
+        .. attribute:: orig_temporary
+
+            The original temporary variable object.
+
+        .. attribute:: hw_inames
+
+            The common list of hw axes that define the original object.
+
+        .. attribute:: hw_dims
+
+            A list of expressions, to be added in front of the shape
+            of the promoted temporary value, corresponding to
+            hardware dimensions
+
+        .. attribute:: non_hw_dims
+
+            A list of expressions, corresponding to the non-hardware
+            dimensions of the promoted temporary value; these follow
+            the hardware dimensions in its shape
+        """
+
+        @memoize_method
+        def as_variable(self):
+            temporary = self.orig_temporary
+            from loopy.kernel.data import TemporaryVariable, temp_var_scope
+            return TemporaryVariable(
+                name=self.name,
+                dtype=temporary.dtype,
+                scope=temp_var_scope.GLOBAL,
+                shape=self.new_shape)
+
+        @property
+        def new_shape(self):
+            return self.hw_dims + self.non_hw_dims
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.insn_query = InstructionQuery(kernel)
+        self.var_name_gen = kernel.get_var_name_generator()
+        self.insn_name_gen = kernel.get_instruction_id_generator()
+        # These fields keep track of updates to the kernel.
+        self.insns_to_insert = []
+        self.insns_to_update = {}
+        self.extra_args_to_add = {}
+        self.updated_iname_to_tag = {}
+        self.updated_temporary_variables = {}
+        self.saves_or_reloads_added = {}
+
+    @memoize_method
+    def auto_promote_temporary(self, temporary_name):
+        temporary = self.kernel.temporary_variables[temporary_name]
+
+        from loopy.kernel.data import temp_var_scope
+        if temporary.scope == temp_var_scope.GLOBAL:
+            # Nothing to be done for global temporaries (I hope)
+            return None
+
+        if temporary.base_storage is not None:
+            raise ValueError(
+                "Cannot promote temporaries with base_storage to global")
+
+        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
+        # is associated with.
This is used for determining the shape of the + # global storage needed for saving and restoring the temporary across + # kernel calls. + # + # TODO: Make a policy decision about which dimensions to use. Currently, + # the code looks at each instruction that defines or uses the temporary, + # and takes the common set of hw-parallel tagged inames associated with + # these instructions. + # + # Furthermore, in the case of local temporaries, inames that are tagged + # hw-local do not contribute to the global storage shape. + hw_inames = self.insn_query.common_hw_inames( + self.insn_query.insns_reading_or_writing(temporary.name)) + + # We want hw_inames to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... + # Sorting lexicographically accomplishes this. + hw_inames = sorted(hw_inames, + key=lambda iname: str(self.kernel.iname_to_tag[iname])) + + # Calculate the sizes of the dimensions that get added in front for + # the global storage of the temporary. + hw_dims = [] + + backing_hw_inames = [] + + for iname in hw_inames: + tag = self.kernel.iname_to_tag[iname] + from loopy.kernel.data import LocalIndexTag + is_local_iname = isinstance(tag, LocalIndexTag) + if is_local_iname and temporary.scope == temp_var_scope.LOCAL: + # Restrict shape to that of group inames for locals. + continue + backing_hw_inames.append(iname) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import aff_to_expr + hw_dims.append( + aff_to_expr( + static_max_of_pw_aff( + self.kernel.get_iname_bounds(iname).size, False))) + + non_hw_dims = temporary.shape + + if len(non_hw_dims) == 0 and len(hw_dims) == 0: + # Scalar not in hardware: ensure at least one dimension. + non_hw_dims = (1,) + + backing_temporary = self.PromotedTemporary( + name=self.var_name_gen(temporary.name + "_save_slot"), + orig_temporary=temporary, + hw_dims=tuple(hw_dims), + non_hw_dims=non_hw_dims, + hw_inames=backing_hw_inames) + + return backing_temporary + + def save_or_reload_impl(self, temporary, subkernel, mode, + promoted_temporary=lp.auto): + assert mode in ("save", "reload") + + if promoted_temporary is auto: + promoted_temporary = self.auto_promote_temporary(temporary) + + if promoted_temporary is None: + return + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger( + self.kernel, + frozenset( + self.insn_query.inames_in_subkernel(subkernel) | + set(promoted_temporary.hw_inames))) + + domain, hw_inames, dim_inames, iname_to_tag = \ + self.augment_domain_for_save_or_reload( + dchg.domain, promoted_temporary, mode, subkernel) + + self.kernel = dchg.get_kernel_with(domain) + + save_or_load_insn_id = self.insn_name_gen( + "{name}.{mode}".format(name=temporary, mode=mode)) + + def subscript_or_var(agg, subscript=()): + from pymbolic.primitives import Subscript, Variable + if len(subscript) == 0: + return Variable(agg) + else: + return Subscript( + Variable(agg), + tuple(map(Variable, subscript))) + + dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + + args = ( + subscript_or_var( + temporary, dim_inames_trunc), + subscript_or_var( + promoted_temporary.name, hw_inames + dim_inames)) + + if mode == "save": + args = reversed(args) + + accessing_insns_in_subkernel = ( + self.insn_query.insns_reading_or_writing(temporary) & + self.insn_query.insns_in_subkernel(subkernel)) + + if mode == "save": + depends_on = accessing_insns_in_subkernel + update_deps = frozenset() + elif mode == "reload": + depends_on = frozenset() + update_deps = accessing_insns_in_subkernel 
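+        # A save must come after all accesses to the temporary within the
+        # subkernel, so the new instruction depends on them; a reload must
+        # come before them, so the accessing instructions are updated to
+        # depend on the new instruction instead.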
+ + pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + + if pre_barrier is not None: + depends_on |= set([pre_barrier]) + + if post_barrier is not None: + update_deps |= set([post_barrier]) + + # Create the load / store instruction. + from loopy.kernel.data import Assignment + save_or_load_insn = Assignment( + *args, + id=save_or_load_insn_id, + within_inames=( + self.insn_query.inames_in_subkernel(subkernel) | + frozenset(hw_inames + dim_inames)), + within_inames_is_final=True, + depends_on=depends_on, + boostable=False, + boostable_into=frozenset()) + + if temporary not in self.saves_or_reloads_added: + self.saves_or_reloads_added[temporary] = set() + self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + + self.insns_to_insert.append(save_or_load_insn) + + for insn_id in update_deps: + insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id]) + self.insns_to_update[insn_id] = insn.copy( + depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) + + self.updated_temporary_variables[promoted_temporary.name] = \ + promoted_temporary.as_variable() + + self.updated_iname_to_tag.update(iname_to_tag) + + @memoize_method + def finish(self): + new_instructions = [] + + insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) + + # Add global no_sync_with between any added reloads and saves + from six import iteritems + for temporary, added_insns in iteritems(self.saves_or_reloads_added): + for insn_id in added_insns: + insn = insns_to_insert[insn_id] + insns_to_insert[insn_id] = insn.copy( + no_sync_with=frozenset( + (added_insn, "global") for added_insn in added_insns)) + + for orig_insn in self.kernel.instructions: + if orig_insn.id in self.insns_to_update: + new_instructions.append(self.insns_to_update[orig_insn.id]) + else: + new_instructions.append(orig_insn) + new_instructions.extend( + sorted(insns_to_insert.values(), key=lambda insn: insn.id)) + + self.updated_iname_to_tag.update(self.kernel.iname_to_tag) + self.updated_temporary_variables.update(self.kernel.temporary_variables) + + kernel = self.kernel.copy( + instructions=new_instructions, + iname_to_tag=self.updated_iname_to_tag, + temporary_variables=self.updated_temporary_variables) + + from loopy.kernel.tools import assign_automatic_axes + return assign_automatic_axes(kernel) + + def save(self, temporary, subkernel): + self.save_or_reload_impl(temporary, subkernel, "save") + + def reload(self, temporary, subkernel): + self.save_or_reload_impl(temporary, subkernel, "reload") + + def augment_domain_for_save_or_reload(self, + domain, promoted_temporary, mode, subkernel): + """ + Add new axes to the domain corresponding to the dimensions of + `promoted_temporary`. These axes will be used in the save/ + reload stage. + """ + assert mode in ("save", "reload") + import islpy as isl + + orig_temporary = promoted_temporary.orig_temporary + orig_dim = domain.dim(isl.dim_type.set) + + # Tags for newly added inames + iname_to_tag = {} + + # FIXME: Restrict size of new inames to access footprint. + + # Add dimension-dependent inames. + dim_inames = [] + domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + + for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): + new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". 
+ format(name=orig_temporary.name, + mode=mode, + dim=dim_idx, + sk=subkernel)) + domain = domain.set_dim_name( + isl.dim_type.set, orig_dim + dim_idx, new_iname) + + if orig_temporary.is_local: + # If the temporary has local scope, then loads / stores can + # be done in parallel. + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() + + dim_inames.append(new_iname) + + # Add size information. + aff = isl.affs_from_space(domain.space) + domain &= aff[0].le_set(aff[new_iname]) + from loopy.symbolic import aff_from_expr + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) + + # FIXME: Use promoted_temporary.hw_inames + hw_inames = [] + + # Add hardware inames duplicates. + for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". + format(name=orig_temporary.name, + mode=mode, + dim=t_idx, + sk=subkernel)) + hw_inames.append(new_iname) + iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + + from loopy.isl_helpers import duplicate_axes + domain = duplicate_axes( + domain, promoted_temporary.hw_inames, hw_inames) + + # The operations on the domain above return a Set object, but the + # underlying domain should be expressible as a single BasicSet. + domain_list = domain.get_basic_set_list() + assert domain_list.n_basic_set() == 1 + domain = domain_list.get_basic_set(0) + return domain, hw_inames, dim_inames, iname_to_tag + +# }}} + + +# {{{ auto save and reload across kernel calls + +def save_and_reload_temporaries(knl): + """ + Add instructions to save and reload temporary variables that are live + across kernel calls. + + The basic code transformation turns schedule segments:: + + t = <...> + <return followed by call> + <...> = t + + into this code:: + + t = <...> + t_save_slot = t + <return followed by call> + t = t_save_slot + <...> = t + + where `t_save_slot` is a newly-created global temporary variable. + + :returns: The resulting kernel + """ + liveness = LivenessAnalysis(knl) + saver = TemporarySaver(knl) + + insn_query = InstructionQuery(knl) + + for sched_idx, sched_item in enumerate(knl.schedule): + + if isinstance(sched_item, CallKernel): + # Any written temporary that is live-out needs to be read into + # memory because of the potential for partial writes. 
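+            # ("Live-out" here means live immediately after this CallKernel
+            # item in the schedule.)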
+ if sched_idx == 0: + # Kernel entry: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_read_or_written_in_subkernel( + sched_item.kernel_name)) + + for temporary in liveness[sched_idx].live_out & interesting_temporaries: + logger.info("reloading {0} at entry of {1}" + .format(temporary, sched_item.kernel_name)) + saver.reload(temporary, sched_item.kernel_name) + + elif isinstance(sched_item, ReturnFromKernel): + if sched_idx == len(knl.schedule) - 1: + # Kernel exit: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_written_in_subkernel( + sched_item.kernel_name)) + + for temporary in liveness[sched_idx].live_in & interesting_temporaries: + logger.info("saving {0} before return of {1}" + .format(temporary, sched_item.kernel_name)) + saver.save(temporary, sched_item.kernel_name) + + return saver.finish() + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b2b76ae9f3a92d93feca2dc9b31591f215b9341e..79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -31,7 +31,7 @@ from loopy.symbolic import ( from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func -from pytools import Record +from pytools import ImmutableRecord from pymbolic import var @@ -39,7 +39,7 @@ import logging logger = logging.getLogger(__name__) -class ExprDescriptor(Record): +class ExprDescriptor(ImmutableRecord): __slots__ = ["insn", "expr", "unif_var_dict"] diff --git a/loopy/type_inference.py b/loopy/type_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a31f011a0ce8e5403b54984eb45db0970a8370b0 --- /dev/null +++ b/loopy/type_inference.py @@ -0,0 +1,581 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six + +from pymbolic.mapper import CombineMapper +import numpy as np + +from loopy.tools import is_integer +from loopy.types import NumpyType + +from loopy.diagnostic import ( + LoopyError, + TypeInferenceFailure, DependencyTypeInferenceFailure) + +import logging +logger = logging.getLogger(__name__) + + +# {{{ type inference mapper + +class TypeInferenceMapper(CombineMapper): + def __init__(self, kernel, new_assignments=None): + """ + :arg new_assignments: mapping from names to either + :class:`loopy.kernel.data.TemporaryVariable` + or + :class:`loopy.kernel.data.KernelArgument` + instances + """ + self.kernel = kernel + if new_assignments is None: + new_assignments = {} + self.new_assignments = new_assignments + self.symbols_with_unknown_types = set() + + def __call__(self, expr, return_tuple=False, return_dtype_set=False): + kwargs = {} + if return_tuple: + kwargs["return_tuple"] = True + + result = super(TypeInferenceMapper, self).__call__( + expr, **kwargs) + + assert isinstance(result, list) + + if return_tuple: + for result_i in result: + assert isinstance(result_i, tuple) + + assert return_dtype_set + return result + + else: + if return_dtype_set: + return result + else: + if not result: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(self.symbols_with_unknown_types))) + + result, = result + return result + + # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) + # are Python-equal (for many common constants such as integers). + + def copy(self): + return type(self)(self.kernel, self.new_assignments) + + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, new_ass) + + @staticmethod + def combine(dtype_sets): + """ + :arg dtype_sets: A list of lists, where each of the inner lists + consists of either zero or one type. An empty list is + consistent with any type. A list with a type requires + that an operation be valid in conjunction with that type. + """ + dtype_sets = list(dtype_sets) + + from loopy.types import LoopyType, NumpyType + assert all( + all(isinstance(dtype, LoopyType) for dtype in dtype_set) + for dtype_set in dtype_sets) + assert all( + 0 <= len(dtype_set) <= 1 + for dtype_set in dtype_sets) + + if not all( + isinstance(dtype, NumpyType) + for dtype_set in dtype_sets + for dtype in dtype_set): + from pytools import is_single_valued, single_valued + if not is_single_valued( + dtype + for dtype_set in dtype_sets + for dtype in dtype_set): + raise TypeInferenceFailure( + "Nothing known about operations between '%s'" + % ", ".join(str(dtype) + for dtype_set in dtype_sets + for dtype in dtype_set)) + + return single_valued(dtype + for dtype_set in dtype_sets + for dtype in dtype_set) + + numpy_dtypes = [dtype.dtype + for dtype_set in dtype_sets + for dtype in dtype_set] + + if not numpy_dtypes: + return [] + + result = numpy_dtypes.pop() + while numpy_dtypes: + other = numpy_dtypes.pop() + + if result.fields is None and other.fields is None: + if (result, other) in [ + (np.int32, np.float32), (np.float32, np.int32)]: + # numpy makes this a double. I disagree. + result = np.dtype(np.float32) + else: + result = ( + np.empty(0, dtype=result) + + np.empty(0, dtype=other) + ).dtype + + elif result.fields is None and other.fields is not None: + # assume the non-native type takes over + # (This is used for vector types.) 
+ result = other + elif result.fields is not None and other.fields is None: + # assume the non-native type takes over + # (This is used for vector types.) + pass + else: + if result is not other: + raise TypeInferenceFailure( + "nothing known about result of operation on " + "'%s' and '%s'" % (result, other)) + + return [NumpyType(result)] + + def map_sum(self, expr): + dtype_sets = [] + small_integer_dtype_sets = [] + for child in expr.children: + dtype_set = self.rec(child) + if is_integer(child) and abs(child) < 1024: + small_integer_dtype_sets.append(dtype_set) + else: + dtype_sets.append(dtype_set) + + from pytools import all + if all(dtype.is_integral() + for dtype_set in dtype_sets + for dtype in dtype_set): + dtype_sets.extend(small_integer_dtype_sets) + + return self.combine(dtype_sets) + + map_product = map_sum + + def map_quotient(self, expr): + n_dtype_set = self.rec(expr.numerator) + d_dtype_set = self.rec(expr.denominator) + + dtypes = n_dtype_set + d_dtype_set + + if all(dtype.is_integral() for dtype in dtypes): + # both integers + return [NumpyType(np.dtype(np.float64))] + + else: + return self.combine([n_dtype_set, d_dtype_set]) + + def map_constant(self, expr): + if is_integer(expr): + for tp in [np.int32, np.int64]: + iinfo = np.iinfo(tp) + if iinfo.min <= expr <= iinfo.max: + return [NumpyType(np.dtype(tp))] + + else: + raise TypeInferenceFailure("integer constant '%s' too large" % expr) + + dt = np.asarray(expr).dtype + if hasattr(expr, "dtype"): + return [NumpyType(expr.dtype)] + elif isinstance(expr, np.number): + # Numpy types are sized + return [NumpyType(np.dtype(type(expr)))] + elif dt.kind == "f": + # deduce the smaller type by default + return [NumpyType(np.dtype(np.float32))] + elif dt.kind == "c": + if np.complex64(expr) == np.complex128(expr): + # (COMPLEX_GUESS_LOGIC) + # No precision is lost by 'guessing' single precision, use that. + # This at least covers simple cases like '1j'. + return [NumpyType(np.dtype(np.complex64))] + + # Codegen for complex types depends on exactly correct types. + # Refuse temptation to guess. 
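+            # (Guessing single precision here could silently cost the user
+            # half their precision.)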
+ raise TypeInferenceFailure("Complex constant '%s' needs to " + "be sized for type inference " % expr) + else: + raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) + + def map_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_linear_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_call(self, expr, return_tuple=False): + from pymbolic.primitives import Variable + + identifier = expr.function + if isinstance(identifier, Variable): + identifier = identifier.name + + if identifier in ["indexof", "indexof_vec"]: + return [self.kernel.index_dtype] + + def none_if_empty(d): + if d: + d, = d + return d + else: + return None + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) + if None in arg_dtypes: + return [] + + mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) + if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct assignments") + + return [mangle_result.result_dtypes[0]] + + raise RuntimeError("unable to resolve " + "function '%s' with %d given arguments" + % (identifier, len(arg_dtypes))) + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, TemporaryVariable): + result = [obj.dtype] + if result[0] is lp.auto: + self.symbols_with_unknown_types.add(expr.name) + return [] + else: + return result + + elif isinstance(obj, KernelArgument): + result = [obj.dtype] + if result[0] is None: + self.symbols_with_unknown_types.add(expr.name) + return [] + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + map_tagged_variable = map_variable + + def map_lookup(self, expr): + agg_result = self.rec(expr.aggregate) + if not agg_result: + return agg_result + + field = agg_result[0].numpy_dtype.fields[expr.name] + dtype = field[0] + return [NumpyType(dtype)] + + def map_comparison(self, expr): + # "bool" is unusable because OpenCL's bool has indeterminate memory + # format. 
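+        # Use int32 instead--scalar relational operators in OpenCL return
+        # an int-typed result anyway.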
+ return [NumpyType(np.dtype(np.int32))] + + map_logical_not = map_comparison + map_logical_and = map_comparison + map_logical_or = map_comparison + + def map_group_hw_index(self, expr, *args): + return [self.kernel.index_dtype] + + def map_local_hw_index(self, expr, *args): + return [self.kernel.index_dtype] + + def map_reduction(self, expr, return_tuple=False): + rec_result = self.rec(expr.expr) + + if rec_result: + rec_result, = rec_result + result = expr.operation.result_dtypes( + self.kernel, rec_result, expr.inames) + else: + result = expr.operation.result_dtypes( + self.kernel, None, expr.inames) + + if result is None: + return [] + + if return_tuple: + return [result] + + else: + if len(result) != 1 and not return_tuple: + raise LoopyError("reductions with more or fewer than one " + "return value may only be used in direct assignments") + + return [result[0]] + +# }}} + + +# {{{ infer single variable + +def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): + return [kernel.index_dtype], [] + + def debug(s): + logger.debug("%s: %s" % (kernel.name, s)) + + dtype_sets = [] + + import loopy as lp + + type_inf_mapper = type_inf_mapper.copy() + + for writer_insn_id in kernel.writer_map().get(var_name, []): + writer_insn = kernel.id_to_insn[writer_insn_id] + if not isinstance(writer_insn, lp.MultiAssignmentBase): + continue + + expr = subst_expander(writer_insn.expression) + + debug(" via expr %s" % expr) + if isinstance(writer_insn, lp.Assignment): + result = type_inf_mapper(expr, return_dtype_set=True) + elif isinstance(writer_insn, lp.CallInstruction): + return_dtype_set = type_inf_mapper(expr, return_tuple=True, + return_dtype_set=True) + + result = [] + for return_dtype_set in return_dtype_set: + result_i = None + found = False + for assignee, comp_dtype_set in zip( + writer_insn.assignee_var_names(), return_dtype_set): + if assignee == var_name: + found = True + result_i = comp_dtype_set + break + + assert found + if result_i is not None: + result.append(result_i) + + debug(" result: %s" % result) + + dtype_sets.append(result) + + if not dtype_sets: + return None, type_inf_mapper.symbols_with_unknown_types + + result = type_inf_mapper.combine(dtype_sets) + + return result, type_inf_mapper.symbols_with_unknown_types + +# }}} + + +class _DictUnionView: + def __init__(self, children): + self.children = children + + def get(self, key): + try: + return self[key] + except KeyError: + return None + + def __getitem__(self, key): + for ch in self.children: + try: + return ch[key] + except KeyError: + pass + + raise KeyError(key) + + +# {{{ infer_unknown_types + +def infer_unknown_types(kernel, expect_completion=False): + """Infer types on temporaries and arguments.""" + + logger.debug("%s: infer types" % kernel.name) + + def debug(s): + logger.debug("%s: %s" % (kernel.name, s)) + + unexpanded_kernel = kernel + if kernel.substitutions: + from loopy.transform.subst import expand_subst + kernel = expand_subst(kernel) + + new_temp_vars = kernel.temporary_variables.copy() + new_arg_dict = kernel.arg_dict.copy() + + # {{{ find names_with_unknown_types + + # contains both arguments and temporaries + names_for_type_inference = [] + + import loopy as lp + for tv in six.itervalues(kernel.temporary_variables): + if tv.dtype is lp.auto: + names_for_type_inference.append(tv.name) + + for arg in kernel.args: + if arg.dtype is None: + names_for_type_inference.append(arg.name) + + # }}} + + item_lookup = _DictUnionView([ + new_temp_vars, + new_arg_dict 
+ ]) + type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + # {{{ work on type inference queue + + from loopy.kernel.data import TemporaryVariable, KernelArgument + + changed_during_last_queue_run = False + queue = names_for_type_inference[:] + + failed_names = set() + while queue or changed_during_last_queue_run: + if not queue and changed_during_last_queue_run: + changed_during_last_queue_run = False + queue = names_for_type_inference[:] + + name = queue.pop(0) + item = item_lookup[name] + + debug("inferring type for %s %s" % (type(item).__name__, item.name)) + + result, symbols_with_unavailable_types = \ + _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander) + + failed = not result + if not failed: + new_dtype, = result + debug(" success: %s" % new_dtype) + if new_dtype != item.dtype: + debug(" changed from: %s" % item.dtype) + changed_during_last_queue_run = True + + if isinstance(item, TemporaryVariable): + new_temp_vars[name] = item.copy(dtype=new_dtype) + elif isinstance(item, KernelArgument): + new_arg_dict[name] = item.copy(dtype=new_dtype) + else: + raise LoopyError("unexpected item type in type inference") + else: + debug(" failure") + + if failed: + if item.name in failed_names: + # this item has failed before, give up. + advice = "" + if symbols_with_unavailable_types: + advice += ( + " (need type of '%s'--check for missing arguments)" + % ", ".join(symbols_with_unavailable_types)) + + if expect_completion: + raise LoopyError( + "could not determine type of '%s'%s" + % (item.name, advice)) + + else: + # We're done here. + break + + # remember that this item failed + failed_names.add(item.name) + + if set(queue) == failed_names: + # We did what we could... 
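+                    # Everything still in the queue has already failed during
+                    # this round--no further progress is possible.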
+                    debug("type inference stalled: %s" % sorted(failed_names))
+                    assert not expect_completion
+                    break
+
+                # can't infer type yet, put back into queue
+                queue.append(name)
+            else:
+                # we've made progress, reset failure markers
+                failed_names = set()
+
+    # }}}
+
+    return unexpanded_kernel.copy(
+            temporary_variables=new_temp_vars,
+            args=[new_arg_dict[arg.name] for arg in kernel.args],
+            )
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/types.py b/loopy/types.py
index b897d9f700b198e73d95a09c7d459ed2d7f877b1..f095d1d58f9eaebb7dcc9c8d41afa73951f2ba84 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -177,7 +177,8 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}
 
 
-def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
+def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False,
+        target=None):
     from loopy.kernel.data import auto
     if allow_none and dtype is None:
         return dtype
@@ -192,10 +193,13 @@ def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
         except Exception:
             pass
 
+    if numpy_dtype is None and target is not None and isinstance(dtype, str):
+        numpy_dtype = target.get_dtype_registry().get_or_register_dtype(dtype)
+
     if isinstance(dtype, LoopyType):
         if for_atomic:
             if isinstance(dtype, NumpyType):
-                return AtomicNumpyType(dtype.dtype)
+                return AtomicNumpyType(dtype.dtype, target=target)
             elif not isinstance(dtype, AtomicType):
                 raise LoopyError("do not know how to convert '%s' to an atomic type"
                         % dtype)
@@ -204,9 +208,9 @@ def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
 
     elif numpy_dtype is not None:
         if for_atomic:
-            return AtomicNumpyType(numpy_dtype)
+            return AtomicNumpyType(numpy_dtype, target=target)
         else:
-            return NumpyType(numpy_dtype)
+            return NumpyType(numpy_dtype, target=target)
 
     else:
         raise TypeError("dtype must be a LoopyType, or convertible to one, "
diff --git a/loopy/version.py b/loopy/version.py
index aa3e7abee41a05595985df574da52c024b52dcb5..f7d157f650304a83164e11763279d3c5eabbc4c0 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v44-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v49-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 5c8f377a6855d0cb3e2f7e759d1dc0b314f31817..a941eecd2b58daf413830fc22500179d3e8a8cf1 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2016.1",
+          "pytools>=2016.2.6",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
          "cgen>=2016.1",
diff --git a/test/test_apps.py b/test/test_apps.py
index 790a44f6acac72e4fa6fe04a45f32813e6204bb9..9eab3fdb1fbc152b65344362d39766793d372d90 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -502,6 +502,112 @@ def test_lbm(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20})
 
 
+def test_fd_demo():
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i,j<n}",
+        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
+            + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
+            + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
+    #assumptions="n mod 16=0")
+    knl = lp.split_iname(knl,
+            "i", 16, outer_tag="g.1", inner_tag="l.1")
+    knl = lp.split_iname(knl,
+            "j", 16, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.add_prefetch(knl, "u",
+            ["i_inner", "j_inner"],
+            fetch_bounding_box=True)
+
+    #n = 1000
+    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
+
+    knl = lp.set_options(knl, write_cl=True)
+    knl = lp.add_and_infer_dtypes(knl,
dict(u=np.float32)) + code, inf = lp.generate_code(knl) + print(code) + + assert "double" not in code + + +def test_fd_1d(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "result[i] = u[i+1]-u[i]") + + knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") + knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") + knl = lp.assume(knl, "n mod 16 = 0") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=2048)) + + +def test_poisson_fem(ctx_factory): + # Stolen from Peter Coogan and Rob Kirby for FEM assembly + ctx = ctx_factory() + + nbf = 5 + nqp = 5 + sdim = 3 + + knl = lp.make_kernel( + "{ [c,i,j,k,ell,ell2,ell3]: \ + 0 <= c < nels and \ + 0 <= i < nbf and \ + 0 <= j < nbf and \ + 0 <= k < nqp and \ + 0 <= ell,ell2 < sdim}", + """ + dpsi(bf,k0,dir) := \ + simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) + Ael[c,i,j] = \ + J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) + """, + assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") + + print(knl) + + knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) + + ref_knl = knl + + knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"]) + + def variant_1(knl): + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + knl = lp.prioritize_loops(knl, "c,i,j") + return knl + + def variant_2(knl): + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.prioritize_loops(knl, "c,i,j") + return knl + + def add_types(knl): + return lp.add_and_infer_dtypes(knl, dict( + w=np.float32, + J=np.float32, + DPsi=np.float32, + DFinv=np.float32, + )) + + for variant in [ + #variant_1, + variant_2 + ]: + knl = variant(knl) + + lp.auto_test_vs_ref( + add_types(ref_knl), ctx, add_types(knl), + parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_isl.py b/test/test_isl.py index 3bd3d221e54df685238cfd1532d2b32662aac99f..f793b1fa99f8768ff4e2fcfaa02aa87119ffcc92 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -44,6 +44,13 @@ def test_aff_to_expr_2(): assert aff_to_expr(x) == (-1)*i0 + 2*(i0 // 2) +def test_pw_aff_to_conditional_expr(): + from loopy.symbolic import pw_aff_to_expr + cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }") + expr = pw_aff_to_expr(cond) + assert str(expr) == "If(i == 0, 0, -1 + i)" + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_loopy.py b/test/test_loopy.py index 4c0ff097bc59e56de6268c0ddd67bd686465b5d8..c5b423936f998b3cc2ef66b07ef3f88aa398cd17 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1105,95 +1105,215 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): +def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): + from loopy.preprocess import preprocess_kernel + from loopy.schedule import get_one_scheduled_kernel + + knl = preprocess_kernel(knl) + knl = get_one_scheduled_kernel(knl) + + from loopy.transform.save import save_and_reload_temporaries + knl = save_and_reload_temporaries(knl) + knl = get_one_scheduled_kernel(knl) + + if debug: + print(knl) + cgr = lp.generate_code_v2(knl) + print(cgr.device_code()) + print(cgr.host_code()) + 1/0 + + _, (out,) = knl(queue, out_host=True) + assert (out == 
out_expect).all() + + +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i]: 0<=i<8 }", + """ + for i + <>t = i + ... gbarrier + out[i] = t + end + """, seq_dependencies=True) + + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) + + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) + + +def test_save_of_private_array(ctx_factory, debug=False): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - pytest.xfail("spilling doesn't yet use local axes") + knl = lp.make_kernel( + "{ [i]: 0<=i<8 }", + """ + for i + <>t[i] = i + ... gbarrier + out[i] = t[i] + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "private") + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) + + +def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i<n and 0<=k<3 }", - """ - for i, k - ... gbarrier - <> t_private_scalar = a[k,i+1] - <> t_private_array[i % 2] = a[k,i+1] - c[k,i] = a[k,i+1] - ... gbarrier - out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] + "{ [i,j,k]: 0<=i,j,k<8 }", + """ + for i + for j + <>t[j] = j end - """, seq_dependencies=True) + ... gbarrier + for k + out[i,k] = t[k] + end + end + """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) - knl = lp.set_temporary_scope(knl, "t_private_scalar", "private") - knl = lp.set_temporary_scope(knl, "t_private_array", "private") + knl = lp.tag_inames(knl, dict(i="g.0")) + knl = lp.set_temporary_scope(knl, "t", "private") - ref_knl = knl + save_and_reload_temporaries_test( + queue, knl, np.vstack((8 * (np.arange(8),))), debug) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - # schedule - from loopy.preprocess import preprocess_kernel - knl = preprocess_kernel(knl) +def test_save_of_private_multidim_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) - # map schedule onto host or device - print(knl) + knl = lp.set_temporary_scope(knl, "t", "private") - cgr = lp.generate_code_v2(knl) + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + save_and_reload_temporaries_test(queue, knl, result, debug) - assert len(cgr.device_programs) == 2 - print(cgr.device_code()) - print(cgr.host_code()) +def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... 
gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "private") + knl = lp.tag_inames(knl, dict(i="g.0")) + + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + save_and_reload_temporaries_test(queue, knl, result, debug) -def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i<n and 0<=k<3 }", + "{ [i,j,k]: 0<=i,j,k<10 }", """ - for i, k - ... gbarrier - <> t_local[i % 8,k] = i % 8 - c[k,i] = a[k,i+1] + for i + for k + <> t_arr[k] = k + end + <> t_scalar = 1 + for j + ... gbarrier + out[j] = t_scalar + ... gbarrier + t_scalar = 10 + end ... gbarrier - out[k,i] = c[k,i] + t_local[i % 8,k] + <> flag = i == 9 + out[i] = t_arr[i] {if=flag} end """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + knl = lp.set_temporary_scope(knl, "t_arr", "private") + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) - knl = lp.set_temporary_scope(knl, "t_local", "local") + result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) - ref_knl = knl + save_and_reload_temporaries_test(queue, knl, result, debug) - knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") - # schedule - from loopy.preprocess import preprocess_kernel - knl = preprocess_kernel(knl) +def test_save_of_local_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<8 }", + """ + for i, j + <>t[2*j] = j + t[2*j+1] = j + ... gbarrier + out[i] = t[2*i] + end + """, seq_dependencies=True) - # map schedule onto host or device - print(knl) + knl = lp.set_temporary_scope(knl, "t", "local") + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) - cgr = lp.generate_code_v2(knl) + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) - assert len(cgr.device_programs) == 2 - print(cgr.device_code()) - print(cgr.host_code()) +def test_save_local_multidim_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}", + """ + for i, j, k + ... gbarrier + <> t_local[k,j] = 1 + ... 
gbarrier
+            out[k,i*2+j] = t_local[k,j]
+        end
+        """, seq_dependencies=True)
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8))
+    knl = lp.set_temporary_scope(knl, "t_local", "local")
+    knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
+
+    save_and_reload_temporaries_test(queue, knl, 1, debug)
 
 
 def test_global_temporary(ctx_factory):
@@ -1609,6 +1729,100 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order):
     assert np.array_equal(a, a2)
 
 
+def test_header_extract():
+    knl = lp.make_kernel('{[k]: 0<=k<n}',
+        """
+        for k
+            T[k] = k**2
+        end
+        """,
+        [lp.GlobalArg('T', shape=(200,), dtype=np.float32),
+         '...'])
+
+    knl = lp.fix_parameters(knl, n=200)
+
+    #test C
+    cknl = knl.copy(target=lp.CTarget())
+    assert str(lp.generate_header(cknl)[0]) == (
+        'void loopy_kernel(float *__restrict__ T);')
+
+    #test CUDA
+    cuknl = knl.copy(target=lp.CudaTarget())
+    assert str(lp.generate_header(cuknl)[0]) == (
+        'extern "C" __global__ void __launch_bounds__(1) '
+        'loopy_kernel(float *__restrict__ T);')
+
+    #test OpenCL
+    oclknl = knl.copy(target=lp.PyOpenCLTarget())
+    assert str(lp.generate_header(oclknl)[0]) == (
+        '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
+        'loopy_kernel(__global float *__restrict__ T);')
+
+
+def test_scalars_with_base_storage(ctx_factory):
+    """ Regression test for !50 """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<1}",
+        "a = 1",
+        [lp.TemporaryVariable("a", dtype=np.float64,
+                              shape=(), base_storage="base")])
+
+    knl(queue, out_host=True)
+
+
+def test_tight_loop_bounds(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        ["{ [i] : 0 <= i <= 5 }",
+         "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"],
+        """
+        for i
+          for j
+            out[j] = j
+          end
+        end
+        """,
+        silenced_warnings="write_race(insn)")
+
+    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
+
+    evt, (out,) = knl(queue, out_host=True)
+
+    assert (out == np.arange(10)).all()
+
+
+def test_tight_loop_bounds_codegen():
+    knl = lp.make_kernel(
+        ["{ [i] : 0 <= i <= 5 }",
+         "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"],
+        """
+        for i
+          for j
+            out[j] = j
+          end
+        end
+        """,
+        silenced_warnings="write_race(insn)",
+        target=lp.OpenCLTarget())
+
+    knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0")
+
+    cgr = lp.generate_code_v2(knl)
+    #print(cgr.device_code())
+
+    for_loop = \
+        "for (int j = " \
+        "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
+        "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 
9 : 2 * lid(0)); ++j)" + + assert for_loop in cgr.device_code() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 0c304b7a854579007f57ba204cbff8f440aaf5fc..c85aa80ec92eb0185d30f96b478ae37043c0d7e0 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): if 1: print("OPS") - op_poly = lp.get_op_poly(hsv) - print(lp.stringify_stats_mapping(op_poly)) + op_map = lp.get_op_map(hsv) + print(lp.stringify_stats_mapping(op_map)) print("MEM") - gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv)) - print(lp.stringify_stats_mapping(gmem_poly)) + gmem_map = lp.get_mem_access_map(hsv).to_bytes() + print(lp.stringify_stats_mapping(gmem_map)) hsv = lp.set_options(hsv, cl_build_options=[ "-cl-denorms-are-zero", diff --git a/test/test_reduction.py b/test/test_reduction.py index b78509b6318a984d117d00b1a6854d9611db80d1..820c669da494f4d8863d274120cd5c0c7eb4420f 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -214,23 +214,18 @@ def test_local_parallel_reduction(ctx_factory, size): lp.auto_test_vs_ref(ref_knl, ctx, knl) -@pytest.mark.parametrize("size", [10000]) +@pytest.mark.parametrize("size", [1000]) def test_global_parallel_reduction(ctx_factory, size): - # ctx = ctx_factory() - # queue = cl.CommandQueue(ctx) + ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ - for i - <> key = make_uint2(i, 324830944) {inames=i} - <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} - <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} - end - z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) + # Using z[0] instead of z works around a bug in ancient PyOpenCL. 
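+        # (This kernel, before the splits applied below, also serves as the
+        # reference for auto_test_vs_ref.)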
+ z[0] = sum(i, i/13) """) - # ref_knl = knl + ref_knl = knl gsize = 128 knl = lp.split_iname(knl, "i", gsize * 20) @@ -242,42 +237,52 @@ def test_global_parallel_reduction(ctx_factory, size): knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL) knl = lp.realize_reduction(knl) + knl = lp.add_dependency( + knl, "writes:acc_i_outer", + "id:red_i_outer_arg_barrier") - #evt, (z,) = knl(queue, n=size) - - #lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref( + ref_knl, ctx, knl, parameters={"n": size}, + print_ref_code=True) -@pytest.mark.parametrize("size", [10000]) -def test_global_parallel_reduction_simpler(ctx_factory, size): +@pytest.mark.parametrize("size", [1000]) +def test_global_mc_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check") + import pyopencl.version # noqa + if cl.version.VERSION < (2016, 2): + pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") knl = lp.make_kernel( - "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}", + "{[i]: 0 <= i < n }", """ - <> key = make_uint2(l+nl*g, 1234) {inames=l:g} - <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr} - <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} - - <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3) - - result = sum(j, tmp[j]) + for i + <> key = make_uint2(i, 324830944) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + end + z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) """) - ng = 50 - knl = lp.fix_parameters(knl, ng=ng) - - knl = lp.set_options(knl, write_cl=True) - ref_knl = knl - knl = lp.split_iname(knl, "l", 128, inner_tag="l.0") - knl = lp.split_reduction_outward(knl, "l_inner") - knl = lp.tag_inames(knl, "g:g.0,j:l.0") + gsize = 128 + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + knl = lp.split_reduction_inward(knl, "i_inner_outer") + from loopy.transform.data import reduction_arg_to_subst_rule + knl = reduction_arg_to_subst_rule(knl, "i_outer") + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", + temporary_scope=lp.temp_var_scope.GLOBAL) + knl = lp.realize_reduction(knl) + knl = lp.add_dependency( + knl, "writes:acc_i_outer", + "id:red_i_outer_arg_barrier") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size}) + lp.auto_test_vs_ref( + ref_knl, ctx, knl, parameters={"n": size}) def test_argmax(ctx_factory): @@ -388,112 +393,6 @@ def test_double_sum_made_unique(ctx_factory): assert b.get() == ref -def test_fd_demo(): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ - + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ - + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") - #assumptions="n mod 16=0") - knl = lp.split_iname(knl, - "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, - "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "u", - ["i_inner", "j_inner"], - fetch_bounding_box=True) - - #n = 1000 - #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) - - knl = lp.set_options(knl, write_cl=True) - knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) - code, inf = lp.generate_code(knl) - print(code) - - assert "double" not in code - - -def test_fd_1d(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - 
"result[i] = u[i+1]-u[i]") - - knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) - ref_knl = knl - - knl = lp.split_iname(knl, "i", 16) - knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") - knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") - knl = lp.assume(knl, "n mod 16 = 0") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=2048)) - - -def test_poisson_fem(ctx_factory): - # Stolen from Peter Coogan and Rob Kirby for FEM assembly - ctx = ctx_factory() - - nbf = 5 - nqp = 5 - sdim = 3 - - knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ - 0 <= c < nels and \ - 0 <= i < nbf and \ - 0 <= j < nbf and \ - 0 <= k < nqp and \ - 0 <= ell,ell2 < sdim}", - """ - dpsi(bf,k0,dir) := \ - simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) - Ael[c,i,j] = \ - J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) - """, - assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") - - print(knl) - - knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) - - ref_knl = knl - - knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"]) - - def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') - knl = lp.prioritize_loops(knl, "c,i,j") - return knl - - def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') - knl = lp.prioritize_loops(knl, "c,i,j") - return knl - - def add_types(knl): - return lp.add_and_infer_dtypes(knl, dict( - w=np.float32, - J=np.float32, - DPsi=np.float32, - DFinv=np.float32, - )) - - for variant in [ - #variant_1, - variant_2 - ]: - knl = variant(knl) - - lp.auto_test_vs_ref( - add_types(ref_knl), ctx, add_types(knl), - parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) - - if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index 68be5b8a260858e058619c796b3836611c8d4f0f..fb502045c7b6b2c7e02d11ad3ebda3b5d13c8bda 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,8 +28,10 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp +from loopy.types import to_loopy_type import numpy as np +from pymbolic.primitives import Variable def test_op_counter_basic(): @@ -44,21 +46,22 @@ def test_op_counter_basic(): name="basic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 - + def test_op_counter_reduction(): @@ -70,15 +73,19 @@ def test_op_counter_reduction(): name="matmul_serial", 
assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l + op_map_dtype = op_map.group_by('dtype') + f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) + assert f32 == f32add + f32mul + def test_op_counter_logic(): @@ -92,15 +99,15 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? assert f64add == n*m @@ -120,24 +127,25 @@ def test_op_counter_specialops(): name="specialops", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m - assert f64pow == i32add == f64rsqrt == f64sin == n*m + assert f64pow == i32add == f64rsq == f64sin == n*m def test_op_counter_bitwise(): @@ -157,17 +165,17 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 
256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params) - i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -196,9 +204,9 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = lp.get_op_poly(knl)[(np.dtype(np.float64), 'mul')] + op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) - flops = poly.eval_with_dict(value_dict) + flops = op_map.eval_with_dict(value_dict) if expect_fallback: assert flops == 144 @@ -206,7 +214,7 @@ def test_op_counter_triangular_domain(): assert flops == 78 -def test_gmem_access_counter_basic(): +def test_mem_access_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -220,31 +228,37 @@ def test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict(params) - assert f32 == 3*n*m*l - assert f64 == 2*n*m - - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict(params) - assert f32 == n*m*l - assert f64 == n*m - - -def test_gmem_access_counter_reduction(): + f32l = mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32l += mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='b') + ].eval_with_dict(params) + f64l = mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64l += mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='load', variable='h') + ].eval_with_dict(params) + assert f32l == 3*n*m*l + assert f64l == 2*n*m + + f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), + stride=0, direction='store', variable='e') + ].eval_with_dict(params) + assert f32s == n*m*l + assert f64s == n*m + + +def test_mem_access_counter_reduction(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -254,23 +268,33 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_gmem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 
256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict(params) - assert f32 == 2*n*m*l + f32l = mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32l += mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='b') + ].eval_with_dict(params) + assert f32l == 2*n*m*l + + f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + assert f32s == n*l - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict(params) - assert f32 == n*l + ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ).to_bytes().eval_and_sum(params) + st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] + ).to_bytes().eval_and_sum(params) + assert ld_bytes == 4*f32l + assert st_bytes == 4*f32s -def test_gmem_access_counter_logic(): +def test_mem_access_counter_logic(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -282,27 +306,29 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict(params) - assert f32 == 2*n*m - assert f64 == n*m - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict(params) - assert f64 == n*m + reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') + + f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), + direction='load') + ].eval_with_dict(params) + f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), + direction='load') + ].eval_with_dict(params) + f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), + direction='store') + ].eval_with_dict(params) + assert f32_g_l == 2*n*m + assert f64_g_l == n*m + assert f64_g_s == n*m -def test_gmem_access_counter_specialops(): +def test_mem_access_counter_specialops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -314,33 +340,43 @@ def test_gmem_access_counter_specialops(): ], name="specialops", assumptions="n,m,l >= 1") - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict(params) + f32 = mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='b') + ].eval_with_dict(params) + f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='h') + ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = 
poly[ - (np.dtype(np.float32), 'uniform', 'store') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict(params) + f32 = mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + f64 = mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='store', variable='e') + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m + filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g']) + #tot = lp.eval_and_sum_polys(filtered_map, params) + tot = filtered_map.eval_and_sum(params) + assert tot == n*m*l + n*m -def test_gmem_access_counter_bitwise(): +def test_mem_access_counter_bitwise(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -357,23 +393,35 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = lp.get_gmem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[ - (np.dtype(np.int32), 'uniform', 'load') - ].eval_with_dict(params) + i32 = mem_map[lp.MemAccess('global', np.int32, + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, + stride=0, direction='load', variable='b') + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), + stride=0, direction='load', variable='h') + ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[ - (np.dtype(np.int32), 'uniform', 'store') - ].eval_with_dict(params) + i32 = mem_map[lp.MemAccess('global', np.int32, + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, + stride=0, direction='store', variable='e') + ].eval_with_dict(params) assert i32 == n*m+n*m*l -def test_gmem_access_counter_mixed(): +def test_mem_access_counter_mixed(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -391,35 +439,44 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = lp.get_gmem_access_poly(knl) # noqa + mem_map = lp.get_mem_access_map(knl) # noqa n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[ - (np.dtype(np.float64), 'uniform', 'load') - ].eval_with_dict(params) - f32uniform = poly[ - (np.dtype(np.float32), 'uniform', 'load') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict(params) + f64uniform = mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64uniform += mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='load', variable='h') + ].eval_with_dict(params) + f32uniform = mem_map[lp.MemAccess('global', np.float32, + stride=0, direction='load', variable='x') + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=Variable('m'), direction='load', + variable='a') + ].eval_with_dict(params) + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=Variable('m'), direction='load', + variable='b') + ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = 
poly[ - (np.dtype(np.float64), 'uniform', 'store') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') - ].eval_with_dict(params) + f64uniform = mem_map[lp.MemAccess('global', np.float64, + stride=0, direction='store', variable='e') + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, + stride=Variable('m'), direction='store', + variable='c') + ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l -def test_gmem_access_counter_nonconsec(): +def test_mem_access_counter_nonconsec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -435,31 +492,43 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = lp.get_gmem_access_poly(knl) # noqa + mem_map = lp.get_mem_access_map(knl) # noqa n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = poly[ - (np.dtype(np.float64), 'nonconsecutive', 'load') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict(params) + f64nonconsec = mem_map[lp.MemAccess('global', np.float64, + stride=Variable('m'), direction='load', + variable='g') + ].eval_with_dict(params) + f64nonconsec += mem_map[lp.MemAccess('global', np.float64, + stride=Variable('m'), direction='load', + variable='h') + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=Variable('m')*Variable('l'), + direction='load', variable='a') + ].eval_with_dict(params) + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), + stride=Variable('m')*Variable('l'), + direction='load', variable='b') + ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[ - (np.dtype(np.float64), 'nonconsecutive', 'store') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') - ].eval_with_dict(params) + f64nonconsec = mem_map[lp.MemAccess('global', np.float64, + stride=Variable('m'), direction='store', + variable='e') + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, + stride=Variable('m')*Variable('l'), + direction='store', variable='c') + ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l -def test_gmem_access_counter_consec(): +def test_mem_access_counter_consec(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -474,27 +543,36 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = lp.get_gmem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = poly[ - (np.dtype(np.float64), 'consecutive', 'load') - ].eval_with_dict(params) - f32consec = poly[ - (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict(params) + #for k in mem_map: + # print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k]) + + f64consec = mem_map[lp.MemAccess('global', np.float64, + stride=1, direction='load', variable='g') + ].eval_with_dict(params) + f64consec += mem_map[lp.MemAccess('global', np.float64, + stride=1, direction='load', variable='h') + ].eval_with_dict(params) + f32consec = mem_map[lp.MemAccess('global', np.float32, + stride=1, 
+                        ].eval_with_dict(params)
+    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
+                         stride=1, direction='load', variable='b')
+                         ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l

-    f64consec = poly[
-            (np.dtype(np.float64), 'consecutive', 'store')
-            ].eval_with_dict(params)
-    f32consec = poly[
-            (np.dtype(np.float32), 'consecutive', 'store')
-            ].eval_with_dict(params)
+    f64consec = mem_map[lp.MemAccess('global', np.float64,
+                        stride=1, direction='store', variable='e')
+                        ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess('global', np.float32,
+                        stride=1, direction='store', variable='c')
+                        ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l

@@ -511,15 +589,15 @@ def test_barrier_counter_nobarriers():
             ],
             name="basic", assumptions="n,m,l >= 1")

-    knl = lp.add_and_infer_dtypes(knl,
-            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    sync_poly = lp.get_synchronization_poly(knl)
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
+    sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    assert len(sync_map) == 1
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1


 def test_barrier_counter_barriers():

@@ -539,13 +617,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
-    poly = lp.get_synchronization_poly(knl)
-    print(poly)
+    sync_map = lp.get_synchronization_map(knl)
+    print(sync_map)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly["barrier_local"].eval_with_dict(params)
+    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2

@@ -560,50 +638,58 @@ def test_all_counters_parallel_matmul():

     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}

-    sync_poly = lp.get_synchronization_poly(knl)
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    sync_map = lp.get_synchronization_map(knl)
+    assert len(sync_map) == 2
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16

-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     f32mul = op_map[
-            (np.dtype(np.float32), 'mul')
+            lp.Op(np.float32, 'mul')
             ].eval_with_dict(params)
     f32add = op_map[
-            (np.dtype(np.float32), 'add')
+            lp.Op(np.float32, 'add')
             ].eval_with_dict(params)
     i32ops = op_map[
-            (np.dtype(np.int32), 'add')
+            lp.Op(np.int32, 'add')
             ].eval_with_dict(params)
     i32ops += op_map[
-            (np.dtype(np.int32), 'mul')
+            lp.Op(np.dtype(np.int32), 'mul')
             ].eval_with_dict(params)

     assert f32mul+f32add == n*m*l*2

-    assert i32ops == n*m*l*4 + l*n*4
-
-    subscript_map = lp.get_gmem_access_poly(knl)
-    f32uncoal = subscript_map[
-            (np.dtype(np.float32), 'nonconsecutive', 'load')
-            ].eval_with_dict(params)
-    f32coal = subscript_map[
-            (np.dtype(np.float32), 'consecutive', 'load')
-            ].eval_with_dict(params)
+    op_map = lp.get_mem_access_map(knl)
-    assert f32uncoal == n*m*l
-    assert f32coal == n*m*l
+    f32coal = op_map[lp.MemAccess('global', np.float32,
+                     stride=1, direction='load', variable='b')
+                     ].eval_with_dict(params)
+    f32coal += op_map[lp.MemAccess('global', np.float32,
+                      stride=1, direction='load', variable='a')
+                      ].eval_with_dict(params)

-    f32coal = subscript_map[
-            (np.dtype(np.float32), 'consecutive', 'store')
-            ].eval_with_dict(params)
+    assert f32coal == n*m+m*l
+
+    f32coal = op_map[lp.MemAccess('global', np.float32,
+                     stride=1, direction='store', variable='c')
+                     ].eval_with_dict(params)

     assert f32coal == n*l

+    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
+    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
+                                direction='load')
+                                ].eval_with_dict(params)
+    assert local_mem_l == n*m*l*2


 def test_gather_access_footprint():
     knl = lp.make_kernel(
@@ -637,6 +723,82 @@ def test_gather_access_footprint_2():
     print(key, count(knl, footprint))


+def test_summations_and_filters():
+
+    knl = lp.make_kernel(
+            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            [
+                """
+                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+                e[i, k+1] = -g[i,k]*h[i,k+1]
+                """
+            ],
+            name="basic", assumptions="n,m,l >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    n = 512
+    m = 256
+    l = 128
+    params = {'n': n, 'm': m, 'l': l}
+
+    mem_map = lp.get_mem_access_map(knl)
+
+    loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params)
+    assert loads_a == 2*n*m*l
+
+    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params)
+    assert global_stores == n*m*l + n*m
+
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
+                                 ).to_bytes().eval_and_sum(params)
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
+                                 ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
+    assert st_bytes == 4*n*m*l + 8*n*m
+
+    # ignore stride and variable names in this map
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
+    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
+                          ].eval_with_dict(params)
+    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
+                          ].eval_with_dict(params)
+    assert f32lall == 3*n*m*l
+    assert f64lall == 2*n*m
+
+    op_map = lp.get_op_map(knl)
+    #for k, v in op_map.items():
+    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
+
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
+    assert f32 == n*m*l*3
+    assert f64 == n*m
+    assert i32 == n*m*2
+
+    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
+    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
+    assert addsub_all == n*m*l + n*m*2
+    assert f32ops_all == n*m*l*3
+
+    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
+    assert non_field == 0
+
+    ops_nodtype = op_map.group_by('name')
+    ops_noname = op_map.group_by('dtype')
+    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
+    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    assert mul_all == n*m*l + n*m
+    assert f64ops_all == n*m
+
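+    # filter_by_func takes an arbitrary predicate over the MemAccess keys;
+    # the filter below keeps only stride-0 float64 loads, i.e. the accesses
+    # to g and h counted above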
+    def func_filter(key):
+        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
+                key.direction == 'load'
+
+    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
+    assert s1f64l == 2*n*m


 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
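
For orientation, the new interface exercised by these tests replaces the old
tuple-keyed ``get_*_poly`` results with count maps keyed by ``lp.Op`` and
``lp.MemAccess`` objects, which can be filtered, grouped, converted to bytes,
and summed. The following is a minimal usage sketch, not part of the patch:
the one-line kernel and the expected counts are illustrative assumptions, and
only functions the tests above already exercise are used::

    import numpy as np
    import loopy as lp

    # a hypothetical elementwise kernel in the style of the tests above
    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = a[i]*b[i]",
            name="axb", assumptions="n >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    params = {"n": 512}

    # operation counts are keyed by lp.Op(dtype, name)
    op_map = lp.get_op_map(knl)
    f32mul = op_map.filter_by(dtype=[np.float32], name=["mul"]
                              ).eval_and_sum(params)
    assert f32mul == 512  # one float32 multiply per i

    # memory access counts are keyed by lp.MemAccess objects; to_bytes()
    # weights each access count by the size of its dtype
    mem_map = lp.get_mem_access_map(knl)
    ld_bytes = mem_map.filter_by(mtype=["global"], direction=["load"]
                                 ).to_bytes().eval_and_sum(params)
    assert ld_bytes == 2*4*512  # a[i] and b[i], 4 bytes each, n times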