diff --git a/doc/index.rst b/doc/index.rst
index 7baff3249a25e69019c06802901538500c1af971..8ab62928dcdddd72902994d72f1796c9bd47b3b5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -43,6 +43,7 @@ Please check :ref:`installation` to get started.
     ref_creation
     ref_kernel
     ref_transform
+    ref_call
     ref_other
     misc
    ref_internals
diff --git a/doc/ref_call.rst b/doc/ref_call.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5a59e84282119209cc89eb18e3a4eda97725edf0
--- /dev/null
+++ b/doc/ref_call.rst
@@ -0,0 +1,193 @@
+Calling Loopy Kernels and External Functions
+============================================
+
+Goals of a function interface
+-----------------------------
+
+- *FIXME:* Needs to change after the new design of program.
+
+- It must be possible to obtain complete information about the function
+  through the expression node alone.
+- Must adhere to :mod:`loopy` semantics of immutability.
+- Must have a class instance linked with the expression node which records
+  the properties of the function.
+- Must indicate in the expression whether the function is known to the
+  kernel. (This is intended to be done by making the function expression node
+  an instance of ``ResolvedFunction`` as soon as the function definition is
+  resolved by the kernel.)
+- Function overloading is discouraged in :mod:`loopy`, as it leads to
+  ambiguity when debugging with the help of the kernel's intermediate
+  representation. Hence, if two expression nodes point to different function
+  instances, they must differ in their representation. For example, ``float
+  sin(float)`` and ``double sin(double)`` should diverge by receiving
+  different identifiers as soon as the data type of the argument is inferred.
+- Must have an interface to register external functions.
+
+
+Scoped functions and resolving
+------------------------------
+
+``ResolvedFunction``\ s are pymbolic nodes within expressions in a ``Loo.py``
+kernel whose names have been resolved by the kernel. The process of matching
+a function identifier with its function definition is called "resolving".
+
+A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it is
+resolved by one of the ``function_id_to_in_knl_callable_mapper``\ s in
+:attr:`LoopKernel.scoped_functions`. Functions that can be resolved include:
+
+- Functions already registered by the target. Some examples include
+  ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C targets).
+- Functions that are defined in ``Loo.py`` and are realized into a
+  different set of instructions during code generation. Some examples
+  include ``make_tuple``, ``ArgExtOp``, ``index_of``, ...
+- Functions registered as ``CallableKernel``\ s using
+  ``lp.register_callable_kernel(...)``.
+- Functions that have been provided through
+  ``lp.register_function_id_to_in_knl_callable_mapper(...)`` (see the sketch
+  below).
+- Functions that can be made known by the user through
+  ``lp.register_function_mangler``. This is planned to be deprecated,
+  as its functionality is superseded by
+  ``lp.register_function_id_to_in_knl_callable_mapper(...)``.
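+
+As a minimal sketch of the registration interface, using the
+``(target, identifier) -> lp.InKernelCallable`` mapper signature described
+in this document (the names ``my_func`` and ``my_func_resolver`` are
+illustrative, not part of the API):
+
+::
+
+    import loopy as lp
+
+    def my_func_resolver(target, identifier):
+        # Resolve "my_func" to a ScalarCallable; returning None lets other
+        # registered mappers (or the target itself) try to resolve it.
+        if identifier == "my_func":
+            return lp.ScalarCallable(name="my_func")
+        return None
+
+    knl = lp.register_function_id_to_in_knl_callable_mapper(
+            knl, my_func_resolver)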
+
+Expressions after a function is scoped
+--------------------------------------
+
+Consider the following expression:
+
+::
+
+    sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i])
+
+During the kernel creation phase, the kernel knows that ``sin`` is a function
+known to the target and should hence be scoped. As expected, after
+``make_kernel`` has been called, the above expression is converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    callable_knl_func(c[i])*mangler_call(d[i])
+
+This also makes an entry in the kernel's ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None)}
+
+Note that at this step, functions are scoped only through their names,
+without any information about their types.
+
+Once the user calls the transformation
+``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``,
+the expression is converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i])
+
+This also makes an entry in the ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)}
+
+Now, if the user calls ``register_function_mangler(knl, 'mangler_call')``,
+one might expect the ``mangler_call`` function to get scoped, but that does
+**not** happen, because the "old" ``function_manglers`` return a match only
+if all the parameters of the function match, viz. name, argument arity and
+argument types. Hence, the ``scoped_functions`` dictionary remains unchanged.
+
+``ResolvedFunction``\ s and specializations
+---------------------------------------------
+
+Consider the same ``ResolvedFunction('sin')`` as above. This function,
+although scoped, does not yet know its types, i.e. it does not yet know
+whether, for a C target, it should emit ``sin``, ``sinf`` or ``sinl``.
+Hence, right now the function can be regarded as a "type-generic" function,
+as further down the pipeline it may take on any one of the above
+definitions. The functions go through a "specialization" process at various
+points in the pipeline, where the attributes of the callables are resolved:
+
+- During type inference, the functions go through type specialization,
+  wherein the ``arg_id_to_dtype`` of the functions is realized.
+- During descriptor inference, the functions go through a description
+  specialization, where ``arg_id_to_descr`` is populated. The
+  ``arg_id_to_descr`` contains important information regarding the shape,
+  strides and scope of the arguments, which forms an important part of
+  ``CallableKernel``, as this information helps to generate the function
+  signature and to adapt the data access patterns of the variables in the
+  callee kernel.
+- Whenever a ``ResolvedFunction`` goes through a specialization, this is
+  indicated by changing the name in the ``pymbolic`` node.
+
+If, during type inference, the type of ``a[i]`` is inferred to be
+``np.float32``, the new ``pymbolic`` node would be:
+
+::
+
+    ResolvedFunction('sin_0')(a[i]) + ...
+
+This name change indicates that the node now points to a different
+``ScalarCallable`` in the dictionary. Hence, a new entry is added to the
+``scoped_functions`` dictionary:
+
+::
+
+    {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None),
+    'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0: np.float32,
+    -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')}
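+
+To make the type-specialization step concrete, here is a minimal sketch of a
+``ScalarCallable`` subclass overriding ``with_types``, modeled on the
+``CBLASGEMV`` example in ``examples/python/call-external.py`` added alongside
+this document (``MySin`` is an illustrative name):
+
+::
+
+    import numpy as np
+    import loopy as lp
+
+    class MySin(lp.ScalarCallable):
+        def with_types(self, arg_id_to_dtype, callables_table):
+            dtype = arg_id_to_dtype.get(0)
+            if dtype is None:
+                # Types are not specialized enough yet; stay type-generic.
+                return self, callables_table
+
+            # Pick the target-specific name based on the inferred dtype.
+            name_in_target = (
+                    "sinf" if dtype.numpy_dtype == np.float32 else "sin")
+            return (self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={0: dtype, -1: dtype}),
+                    callables_table)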
+
+Description Inference
+---------------------
+
+Although this step has no significance for a ``ScalarCallable``, it forms a
+very important part of ``CallableKernel``: during this step, the
+``dim_tags``, ``shape`` and ``address_space`` of the arguments of the
+callable kernel are altered.
+
+- The ``dim_tags`` attribute helps to ensure that the memory layout
+  between the caller and the callee kernel is coherent.
+- The ``address_space`` attribute ensures that, while writing the device
+  code, we emit the appropriate scope qualifiers for the function
+  declaration arguments.
+- The ``shape`` attribute helps with:
+
+  - Storage allocation.
+  - Memory layout.
+  - Catching out-of-bounds accesses in ``Loo.py``.
+
+Hence, over the ``Loo.py`` pipeline, one might expect the following
+evolution of the ``sin`` pymbolic call expression node:
+
+::
+
+    sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) ->
+    (Type Inference) -> ResolvedFunction(Variable('sin_0')) ->
+    (Descriptor Inference) -> ResolvedFunction(Variable('sin_1'))
+
+Changes on the target side to accommodate the new function interface
+--------------------------------------------------------------------
+
+The earlier ``function_mangler`` member method of the class
+``lp.ASTBuilderBase`` is replaced by ``function_id_in_knl_callable_mapper``.
+The function resolvers return a list of functions with the signature
+``(target, identifier) -> lp.InKernelCallable``.
+
+An example: Calling BLAS
+------------------------
+
+.. literalinclude:: ../examples/python/call-external.py
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index e48fcb31c3c5632459078db499a1068e114f9021..d93be3e58aaeafbe9298dae0c4856873a866651a 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -333,7 +333,7 @@ an explicit dependency:
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
     ...     out[i,j] = 2*out[i,j] {dep=transpose}
-    ...     """)
+    ...     """, name="transpose_and_dbl")
 
 ``{id=transpose}`` assigns the identifier *transpose* to the first
 instruction, and ``{dep=transpose}`` declares a dependency of the second
@@ -342,9 +342,9 @@ that these dependencies show up there, too:
 
 .. doctest::
 
-    >>> print(knl.stringify(with_dependencies=True))
+    >>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True))
     ---------------------------------------------------------------------------
-    KERNEL: loopy_kernel
+    KERNEL: transpose_and_dbl
     ---------------------------------------------------------------------------
     ...
--------------------------------------------------------------------------- @@ -394,7 +394,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -743,7 +743,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_size_upper_bounds() + >>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table) >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) @@ -1165,7 +1165,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1186,7 +1186,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1214,8 +1214,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_linearized_kernel`: - >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1244,10 +1246,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to put those instructions into the schedule. 
- >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1286,7 +1288,7 @@ does in more detail: The kernel translates into two OpenCL kernels. - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -1312,7 +1314,7 @@ Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - >>> evt, (out,) = knl(queue, arr=arr) + >>> evt, (out,) = prog(queue, arr=arr) >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] @@ -1549,7 +1551,7 @@ containing different types of data: ... """ ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] ... e[i, k] = g[i,k]*(2+h[i,k+1]) - ... """) + ... """, name="stats_knl") >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) @@ -1560,7 +1562,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, stats_knl) : ... Each line of output will look roughly like:: @@ -1586,12 +1588,12 @@ One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_wi >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1648,15 +1650,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... 
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1691,13 +1693,13 @@ We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dic .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1715,13 +1717,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1758,12 +1760,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... 
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1773,13 +1775,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1799,12 +1801,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, stats_knl) : ... 
+ MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1813,13 +1815,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1853,14 +1855,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, stats_knl) : [l, m, n] -> { 1 } We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1913,8 +1915,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257bf80fdfcc3d3b978a7dca2d401c48271..d9ac1f1b22a92b138e4f6432315f281b2a894aed 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(fill)" + "print(prog)" ] }, { @@ -91,10 +91,10 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", - "! RESULT = [tr_fill]\n", + "! 
RESULT = tr_fill\n", "!\n", "!$loopy end" ] @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(tr_fill)" + "print(prog)" ] }, { diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 23840f09a46ab97902a8d1ed7e078a7c70d36dec..733cdaac4d9153803dcb54d5c114a33871403bbf 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -28,5 +28,5 @@ end subroutine ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", ! precompute_outer_inames="i_outer, j_outer, k_outer", ! default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0403a7ab475b3e357f18489847367c3d..2b156bdd709e8f4258492d258adb888ad16fbccd 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68ae2fc6f3b7052325fcd2378e9880e47..c7ebb75667142a8bb470b32f1d92177e135db9b2 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b63492063bfd2a9604c42dbf65b2ecb86bf..211c38049076cbe065ce847f948d724c293a032c 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end
diff --git a/examples/python/call-external.py b/examples/python/call-external.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b25d6e015780789c5e56af46d47a14e4611cf8
--- /dev/null
+++ b/examples/python/call-external.py
@@ -0,0 +1,99 @@
+import loopy as lp
+import numpy as np
+from loopy.diagnostic import LoopyError
+from loopy.target.c import CTarget
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+# {{{ blas callable
+
+class CBLASGEMV(lp.ScalarCallable):
+    def with_types(self, arg_id_to_dtype, callables_table):
+        mat_dtype = arg_id_to_dtype.get(0)
+        vec_dtype = arg_id_to_dtype.get(1)
+
+        if mat_dtype is None or vec_dtype is None:
+            # types aren't specialized enough to be resolved
+            return self, callables_table
+
+        if mat_dtype != vec_dtype:
+            raise LoopyError("GEMV requires same dtypes for matrix and "
+                    "vector")
+
+        if vec_dtype.numpy_dtype == np.float32:
+            name_in_target = "cblas_sgemv"
+        elif vec_dtype.numpy_dtype == np.float64:
+            name_in_target = "cblas_dgemv"
+        else:
+            raise LoopyError("GEMV is only supported for float32 and float64 "
+                    "types")
+
+        return (self.copy(name_in_target=name_in_target,
+                arg_id_to_dtype={0: vec_dtype,
+                    1: vec_dtype,
+                    -1: vec_dtype}),
+                callables_table)
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        mat_descr = arg_id_to_descr.get(0)
+        vec_descr = arg_id_to_descr.get(1)
+        res_descr = arg_id_to_descr.get(-1)
+
+        if mat_descr is None or vec_descr is None or res_descr is None:
+            # shapes aren't specialized enough to be resolved
+            return self, callables_table
+
+        assert mat_descr.shape[1] == vec_descr.shape[0]
+        assert mat_descr.shape[0] == res_descr.shape[0]
+        assert len(vec_descr.shape) == len(res_descr.shape) == 1
+        # handling only the easy case when stride == 1
+        assert vec_descr.dim_tags[0].stride == 1
+        assert mat_descr.dim_tags[1].stride == 1
+        assert res_descr.dim_tags[0].stride == 1
+
+        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from pymbolic import var
+        mat_descr = self.arg_id_to_descr[0]
+        m, n = mat_descr.shape
+        ecm = expression_to_code_mapper
+        mat, vec = insn.expression.parameters
+        result, = insn.assignees
+
+        c_parameters = [var("CblasRowMajor"),
+                var("CblasNoTrans"),
+                m, n,
+                1,
+                ecm(mat).expr,
+                1,
+                ecm(vec).expr,
+                1,
+                ecm(result).expr,
+                1]
+        return (var(self.name_in_target)(*c_parameters),
+                False  # cblas_gemv does not return anything
+                )
+
+    def generate_preambles(self, target):
+        assert isinstance(target, CTarget)
+        yield ("99_cblas", "#include <cblas.h>")
+        return
+
+# }}}
+
+
+n = 10
+
+knl = lp.make_kernel(
+        "{:}",
+        """
+        y[:] = gemv(A[:, :], x[:])
+        """, [
+            lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
+            lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
+            lp.GlobalArg("y", shape=(n, )), ...],
+        target=CTarget())
+
+knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
+print(lp.generate_code_v2(knl).device_code())
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index 7f80175ebe82b8412a38708a5b1d32042d8061fe..d97fc3fa67adb22c17d4f60c2e4283aed727af8a 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -22,7 +22,9 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl)
+knl = 
knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) + # map schedule onto host or device print(knl) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 41fddfdee2ddf3b670bf9770ad8c4b3ec9ea7da1..ce40487b1f41a6a591134a21eeb14113fd8be4fa 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -29,8 +29,6 @@ def transform(knl, vars, stream_dtype): def gen_code(knl): - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) codegen_result = lp.generate_code_v2(knl) return codegen_result.device_code() + "\n" + codegen_result.host_code() diff --git a/examples/python/sparse.py b/examples/python/sparse.py index 0e56df1bc3085976bfadd783f976fa912af45da8..b4dd07df40007db16ab588c26dfefb4aadb4b7eb 100644 --- a/examples/python/sparse.py +++ b/examples/python/sparse.py @@ -11,9 +11,9 @@ k = lp.make_kernel([ <> length = rowend - rowstart y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]]) end - """) + """, name="spmv") k = lp.add_and_infer_dtypes(k, { - "values,x": np.float64, "rowstarts,colindices": k.index_dtype + "values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype }) -print(lp.generate_code(k)[0]) +print(lp.generate_code_v2(k).device_code()) diff --git a/loopy/__init__.py b/loopy/__init__.py index 9c4bfa6d0781677d0d3da8ddcd5ac44d2f90fee5..6cabbf614e0aa3ef938972a0e8af5c168467249a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -24,13 +24,10 @@ THE SOFTWARE. from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning - +from loopy.program import iterate_over_kernels_if_given_program # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( LegacyStringInstructionTag, UseStreamingStoreTag, MemoryOrdering, memory_ordering, @@ -47,6 +44,10 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import ( + Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -59,7 +60,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -115,16 +116,21 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import (register_callable, + merge, inline_callable_kernel, rename_callable) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call + # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import preprocess_kernel, realize_reduction +from loopy.preprocess import (preprocess_kernel, realize_reduction, + preprocess_program, infer_arg_descr) from loopy.schedule import ( generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) -from loopy.statistics import (ToCountMap, CountGranularity, +from loopy.statistics import 
(ToCountMap, ToCountPolynomialMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map, get_synchronization_map, gather_access_footprints, - gather_access_footprint_bytes) + gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -167,6 +173,10 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", "CallableKernel", + + "Program", "make_program", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -174,9 +184,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "default_function_mangler", "single_arg_function_mangler", - - "make_kernel", "UniqueName", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", @@ -229,6 +237,13 @@ __all__ = [ "add_barrier", + "register_callable", + "merge", + + "inline_callable_kernel", "rename_callable", + + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", @@ -244,17 +259,20 @@ __all__ = [ "infer_unknown_types", - "preprocess_kernel", "realize_reduction", + "preprocess_kernel", "realize_reduction", "preprocess_program", + "infer_arg_descr", + "generate_loop_schedules", "get_one_scheduled_kernel", "get_one_linearized_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", - "get_synchronization_map", "gather_access_footprints", - "gather_access_footprint_bytes", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", + "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", @@ -294,6 +312,7 @@ __all__ = [ # {{{ set_options +@iterate_over_kernels_if_given_program def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional @@ -301,6 +320,7 @@ def set_options(kernel, *args, **kwargs): See also :class:`Options`. 
""" + assert isinstance(kernel, LoopKernel) if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -332,6 +352,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration +@iterate_over_kernels_if_given_program def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` @@ -356,6 +377,7 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) +@iterate_over_kernels_if_given_program def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -373,6 +395,7 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) +@iterate_over_kernels_if_given_program def register_function_manglers(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` @@ -438,7 +461,7 @@ class CacheMode: # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`LoopKernel` that changes the data layout + """Returns a :class:`loopy.Program` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. *old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 988e83f88c7a1f7a065813f3c1f9319695b0d97c..e3e41beef89c6796a4bef226b5f5f933f286478e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -27,6 +27,7 @@ from pytools import Record import numpy as np import loopy as lp + from loopy.diagnostic import LoopyError, AutomaticTestFailure @@ -115,7 +116,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel.get_written_variables() + is_output = kernel_arg.is_output if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( @@ -302,12 +303,10 @@ def _default_check_result(result, ref_result): if not np.allclose(ref_result, result, rtol=1e-3, atol=1e-3): l2_err = ( np.sum(np.abs(ref_result-result)**2) - / - np.sum(np.abs(ref_result)**2)) + / np.sum(np.abs(ref_result)**2)) linf_err = ( np.max(np.abs(ref_result-result)) - / - np.max(np.abs(ref_result-result))) + / np.max(np.abs(ref_result-result))) return (False, "results do not match -- (rel) l_2 err: %g, l_inf err: %g" % (l2_err, linf_err)) @@ -366,12 +365,13 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors, need_image_support # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, + ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, - quiet=False, blacklist_ref_vendors=[]): + quiet=False, blacklist_ref_vendors=[], ref_entrypoint=None, + test_entrypoint=None): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. 
@@ -383,24 +383,37 @@ def auto_test_vs_ref(
 
     import pyopencl as cl
 
-    if test_knl is None:
-        test_knl = ref_knl
+    if test_prog is None:
+        test_prog = ref_prog
         do_check = False
 
-    if len(ref_knl.args) != len(test_knl.args):
-        raise LoopyError("ref_knl and test_knl do not have the same number "
+    if ref_entrypoint is None:
+        if len(ref_prog.entrypoints) != 1:
+            raise LoopyError("Unable to guess entrypoint for ref_prog.")
+        ref_entrypoint = list(ref_prog.entrypoints)[0]
+
+    if test_entrypoint is None:
+        if len(test_prog.entrypoints) != 1:
+            raise LoopyError("Unable to guess entrypoint for test_prog.")
+        test_entrypoint = list(test_prog.entrypoints)[0]
+
+    ref_prog = lp.preprocess_kernel(ref_prog)
+    test_prog = lp.preprocess_kernel(test_prog)
+
+    if len(ref_prog[ref_entrypoint].args) != len(test_prog[test_entrypoint].args):
+        raise LoopyError("ref_prog and test_prog do not have the same number "
                 "of arguments")
 
-    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
+    for i, (ref_arg, test_arg) in enumerate(zip(ref_prog[ref_entrypoint].args,
+            test_prog[test_entrypoint].args)):
         if ref_arg.name != test_arg.name:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
         if ref_arg.dtype != test_arg.dtype:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
-    from loopy.compiled import CompiledKernel
     from loopy.target.execution import get_highlighted_code
 
     if isinstance(op_count, (int, float)):
@@ -421,14 +434,15 @@ def auto_test_vs_ref(
     # {{{ compile and run reference code
 
     from loopy.type_inference import infer_unknown_types
-    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)
+    ref_prog = infer_unknown_types(ref_prog, expect_completion=True)
 
     found_ref_device = False
 
     ref_errors = []
 
     from loopy.kernel.data import ImageArg
-    need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_knl.args)
+    need_ref_image_support = any(isinstance(arg, ImageArg)
+            for arg in ref_prog[ref_entrypoint].args)
 
     for dev in _enumerate_cl_devices_for_ref_test(
             blacklist_ref_vendors, need_ref_image_support):
@@ -436,30 +450,26 @@ def auto_test_vs_ref(
         ref_ctx = cl.Context([dev])
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
+        ref_codegen_result = lp.generate_code_v2(ref_prog)
 
-        pp_ref_knl = lp.preprocess_kernel(ref_knl)
-
-        for knl in lp.generate_loop_schedules(pp_ref_knl):
-            ref_sched_kernel = knl
-            break
+        ref_implemented_data_info = ref_codegen_result.implemented_data_infos[
+                ref_entrypoint]
 
         logger.info("{} (ref): trying {} for the reference calculation".format(
-            ref_knl.name, dev))
+            ref_entrypoint, dev))
 
-        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
         if not quiet and print_ref_code:
             print(75*"-")
             print("Reference Code:")
             print(75*"-")
-            print(get_highlighted_code(ref_compiled.get_code()))
+            print(get_highlighted_code(
+                ref_codegen_result.device_code()))
             print(75*"-")
 
-        ref_kernel_info = ref_compiled.kernel_info(frozenset())
-
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel,
-                            ref_kernel_info.implemented_data_info,
+                    make_ref_args(ref_prog[ref_entrypoint],
+                            ref_implemented_data_info,
                             ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
@@ -484,13 +494,13 @@ def 
auto_test_vs_ref( ref_queue.finish() logger.info("{} (ref): using {} for the reference calculation".format( - ref_knl.name, dev)) - logger.info("%s (ref): run" % ref_knl.name) + ref_entrypoint, dev)) + logger.info("%s (ref): run" % ref_entrypoint) ref_start = time() if not AUTO_TEST_SKIP_RUN: - ref_evt, _ = ref_compiled(ref_queue, **ref_args) + ref_evt, _ = ref_prog(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) @@ -498,7 +508,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_knl.name) + logger.info("%s (ref): run done" % ref_entrypoint) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -521,160 +531,144 @@ def auto_test_vs_ref( from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget - if test_knl.state not in [ + if test_prog[test_entrypoint].state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED]: - if isinstance(test_knl.target, PyOpenCLTarget): - test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) - - test_knl = lp.preprocess_kernel(test_knl) + if isinstance(test_prog.target, PyOpenCLTarget): + test_prog = test_prog.copy(target=PyOpenCLTarget(ctx.devices[0])) - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] - - test_kernel_count = 0 + test_prog = lp.preprocess_kernel(test_prog) from loopy.type_inference import infer_unknown_types - for i, kernel in enumerate(test_kernels): - test_kernel_count += 1 - if test_kernel_count > max_test_kernel_count: - break - - kernel = infer_unknown_types(kernel, expect_completion=True) - compiled = CompiledKernel(ctx, kernel) + test_prog = infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog[test_entrypoint], + test_prog_codegen_result.implemented_data_infos[ + test_entrypoint], + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel:") + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - kernel_info = compiled.kernel_info(frozenset()) + logger.info("%s: run warmup" % (test_entrypoint)) - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - args["out_host"] = False + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - # {{{ find cl program + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - for name in dir(kernel_info.cl_kernels): - if name.startswith("__"): - continue - 
cl_kernel = getattr(kernel_info.cl_kernels, name) - cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) - break - else: - assert False, "could not find cl_program" + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - # }}} + need_check = False - print(type(cl_program)) - if hasattr(cl_program, "binaries"): - print(cl_program.binaries[0]) + events = [] + queue.finish() - print(75*"-") + logger.info("%s: warmup done" % (test_entrypoint)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_entrypoint)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = max(warmup_rounds, 1) - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = max(warmup_rounds, 1) + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_entrypoint)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: 
timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % ( format_float_or_none(elapsed_event), diff --git a/loopy/check.py b/loopy/check.py index 5804c514f95483a511a90e62dd083dcbe1ae0a74..4656abbd00cd34f3ab465fe6401b96df1e731711 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -23,15 +23,18 @@ THE SOFTWARE. from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper -from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel -from loopy.type_inference import TypeInferenceMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction +from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning, + warn_with_kernel) +from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) from pytools import memoize_method from collections import defaultdict +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -87,6 +90,65 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ResolvedFunction): + return (frozenset([expr.function.name]) | + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) + else: + return self.combine(self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_resolved(kernel): + """ Checks if all call nodes in the *kernel* expression have been + resolved. + """ + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' -- register a " + "callable corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unsupported instruction type %s." 
% type(insn).__name__) + # }}} @@ -97,7 +159,7 @@ def check_identifiers_in_subst_rules(knl): VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) -class SubscriptIndicesIsIntChecker(TypeInferenceMapper): +class SubscriptIndicesIsIntChecker(TypeReader): def map_subscript(self, expr): for idx in expr.index_tuple: type_inf_result = self.rec(idx) @@ -113,12 +175,12 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): """ Checks is every array access is of type :class:`int`. """ from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -191,6 +253,19 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + from itertools import chain + iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in + kernel.all_inames()))) + return { + tag for tag in iname_tags if + isinstance(tag, UniqueTag)} + + def check_multiple_tags_allowed(kernel): """ Checks if a multiple tags of an iname are compatible. @@ -208,12 +283,14 @@ def check_multiple_tags_allowed(kernel): "tags: {}".format(iname.name, iname.tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, callables_table): """ Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. 
""" from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -226,6 +303,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): """ @@ -237,9 +329,11 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= insn.within_inames: raise LoopyError( "instruction '%s' references " - "inames '%s' that the instruction does not depend on" + "inames '%s' that the instruction does not depend on in " + "the kernel '%s'" % (insn.id, - ", ".join(expression_inames - insn.within_inames))) + ", ".join(expression_inames + - insn.within_inames), kernel.name)) def check_for_unused_inames(kernel): @@ -529,7 +623,7 @@ def check_write_destinations(kernel): def check_has_schedulable_iname_nesting(kernel): from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) + get_iname_duplication_options) if not has_schedulable_iname_nesting(kernel): import itertools as it opt = get_iname_duplication_options(kernel) @@ -834,14 +928,21 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + from loopy.kernel.data import auto + if all(arg.dtype not in [None, auto] for arg in kernel.args) and ( + all(tv.dtype not in [None, auto] for tv in + kernel.temporary_variables.values())): + # only check if all types are known + check_for_integer_subscript_indices(kernel, callables_table) + + check_functions_are_resolved(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -870,7 +971,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -885,7 +987,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + callables_table) group_axes = {ax for ax, length in enumerate(group_size)} local_axes = {ax for ax, 
length in enumerate(local_size)} @@ -902,7 +1005,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -957,9 +1061,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table) # }}} @@ -1109,15 +1214,19 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + # FIXME `check_for_unused_hw_axes_in_insns` currently flags a problem + # in the callee if a caller kernel, at a call site, uses hardware axes + # (say `g.0` and `g.1`). It does not seem that that knowledge is + # propagated to the callee. + # check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index 4230b74967fc0fa7dcb0064bb712ee9ab140b299..a7d209ae87b2120f90a8d360c3ff9eb13bc925f5 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -63,11 +63,9 @@ def main(): parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), default="opencl") - parser.add_argument("--name") parser.add_argument("--transform") parser.add_argument("--edit-code", action="store_true") parser.add_argument("--occa-defines") - parser.add_argument("--occa-add-dummy-arg", action="store_true") parser.add_argument("--print-ir", action="store_true") args = parser.parse_args() @@ -106,9 +104,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", ".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile) as infile_fd: infile_content = infile_fd.read() @@ -159,10 +159,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - if args.name is not None: - kernel = kernel.copy(name=args.name) - - kernels = [kernel] + prg = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -179,69 +176,31 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - kernels = lp.parse_transformed_fortran( + prg = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) - if args.name is not None: - kernels = [kernel for kernel in kernels - if kernel.name == args.name] - - if not kernels: - raise RuntimeError("no kernels found (name specified: %s)" - % args.name) - else: raise RuntimeError("unknown language: '%s'" % args.lang) + if not isinstance(prg, lp.Program): + # FIXME + assert 
isinstance(prg, list) # of kernels + raise NotImplementedError("convert list of kernels to Program") + if args.print_ir: - for kernel in kernels: - print(kernel, file=sys.stderr) - - if args.occa_add_dummy_arg: - new_kernels = [] - for kernel in kernels: - new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) - ] + kernel.args - new_kernels.append(kernel.copy(args=new_args)) - - kernels = new_kernels - del new_kernels - - codes = [] - from loopy.codegen import generate_code - for kernel in kernels: - kernel = lp.preprocess_kernel(kernel) - code, impl_arg_info = generate_code(kernel) - codes.append(code) + print(prg, file=sys.stderr) + + prg = lp.preprocess_kernel(prg) + cgr = lp.generate_code_v2(prg) if args.outfile is not None: outfile = args.outfile else: outfile = "-" - code = "\n\n".join(codes) - - # {{{ edit code if requested - - import os - edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") - need_edit = args.edit_code - if not need_edit and edit_kernel_env is not None: - # Do not replace with "any()"--Py2.6/2.7 bug doesn't like - # comprehensions in functions with exec(). - - for k in kernels: - if edit_kernel_env.lower() in k.name.lower(): - need_edit = True - - if need_edit: - from pytools import invoke_editor - code = invoke_editor(code, filename="edit.cl") - - # }}} + code = cgr.device_code() if outfile == "-": sys.stdout.write(code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 9bc8e079ccf0fc9239e38976f6e6e89db9aa9ff6..3c02a724b9b5c3bf5e3b3907df960fd7a1a2d178 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -20,16 +20,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from loopy.diagnostic import LoopyError, warn -from pytools import ImmutableRecord, ProcessLogger +import logging +logger = logging.getLogger(__name__) + import islpy as isl +from collections import OrderedDict +from loopy.diagnostic import LoopyError, warn +from pytools import ImmutableRecord + from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -import logging -logger = logging.getLogger(__name__) + +from loopy.symbolic import CombineMapper +from functools import reduce + +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger, memoize_method __doc__ = """ .. currentmodule:: loopy.codegen @@ -165,6 +175,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState: """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -207,21 +218,34 @@ class CodeGenerationState: .. attribute:: schedule_index_end + .. attribute:: callables_table + + A mapping from callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: is_entrypoint + + A :class:`bool` to indicate if the code is being generated for an + entrypoint kernel + .. attribute:: codegen_cache_manager An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`. 
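# Illustrative sketch (not part of the patch): the rewritten CLI above
# drives everything through the Program-based pipeline. Assuming a
# trivial kernel, the same flow looks like this:

import loopy as lp

knl = lp.make_kernel(
        "{ [i]: 0 <= i < n }",
        "out[i] = 2*a[i]",
        lang_version=(2018, 2))

prg = lp.preprocess_kernel(knl)
cgr = lp.generate_code_v2(prg)   # a CodeGenerationResult
print(cgr.device_code())         # one string, replacing the old
                                 # per-kernel generate_code() loop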
""" - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + callables_table, + is_entrypoint, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None, codegen_cachemanager=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -230,6 +254,8 @@ class CodeGenerationState: self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.callables_table = callables_table + self.is_entrypoint = is_entrypoint self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -239,19 +265,24 @@ class CodeGenerationState: # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, vectorization_info=None, - is_generating_device_code=None, - gen_program_name=None, + var_subst_map=None, is_entrypoint=None, vectorization_info=None, + is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info + if is_entrypoint is None: + is_entrypoint = self.is_entrypoint + if vectorization_info is False: vectorization_info = None @@ -269,6 +300,7 @@ class CodeGenerationState: return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -278,6 +310,8 @@ class CodeGenerationState: seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + callables_table=self.callables_table, + is_entrypoint=is_entrypoint, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -389,6 +423,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. 
attribute:: kernel @@ -401,44 +461,21 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, callables_table, target, + is_entrypoint): """ :returns: a :class:`CodeGenerationResult` + + :param kernel: An instance of :class:`loopy.LoopKernel`. """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - - if kernel.state != KernelState.LINEARIZED: + if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - - if CACHING_ENABLED: - input_kernel = kernel - try: - result = code_gen_cache[input_kernel] - logger.debug("%s: code generation cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, callables_table) codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") @@ -454,13 +491,13 @@ def generate_code_v2(kernel): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -486,6 +523,7 @@ def generate_code_v2(kernel): codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -497,14 +535,17 @@ def generate_code_v2(kernel): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), + callables_table=callables_table, + is_entrypoint=is_entrypoint, codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel), ) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -539,7 +580,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -555,10 +596,160 @@ def generate_code_v2(kernel): codegen_plog.done() + return codegen_result + + +def diverge_callee_entrypoints(program): + """ + If a kernel is both an entrypoint and a callee, then rename the callee. 
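# Illustrative sketch (not part of the patch): the renaming scheme of
# diverge_callee_entrypoints on plain dicts. A name that is both an
# entrypoint and a callee gets a fresh alias for its callee role, so the
# two roles can be code-generated differently. The real version also
# rewrites the ResolvedFunction nodes inside each subkernel.

from pytools import UniqueNameGenerator

callables = {"main": "<kernel main>", "helper": "<kernel helper>"}
entrypoints = {"main"}
callee_ids = {"main", "helper"}    # "main" is also called somewhere

vng = UniqueNameGenerator(set(callables))
renames = {name: vng(based_on=name)
        for name in callee_ids & entrypoints}

for old, new in renames.items():
    callables[new] = callables[old] + " (callee copy)"

print(renames)             # {'main': 'main_0'}
print(sorted(callables))   # ['helper', 'main', 'main_0']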
+ """ + from loopy.program import _get_callable_ids + from pytools import UniqueNameGenerator + callable_ids = _get_callable_ids(program.callables_table, + program.entrypoints) + + new_callables = {} + renames = {} + + vng = UniqueNameGenerator(set(program.callables_table.keys())) + + for clbl_id in callable_ids & program.entrypoints: + renames[clbl_id] = vng(based_on=clbl_id) + + for name, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + from loopy.program import ( + rename_resolved_functions_in_a_single_kernel) + knl = rename_resolved_functions_in_a_single_kernel( + clbl.subkernel, renames) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + for clbl_id in callable_ids & program.entrypoints: + knl = new_callables[clbl_id].subkernel.copy(name=renames[clbl_id]) + new_callables[renames[clbl_id]] = new_callables[clbl_id].copy( + subkernel=knl) + + return program.copy(callables_table=new_callables) + + +@memoize_method +def generate_code_v2(program): + """ + Returns an instance of :class:`CodeGenerationResult`. + + :param program: An instance of :class:`loopy.Program`. + """ + + from loopy.kernel import LoopKernel + from loopy.program import make_program + from loopy.codegen.result import CodeGenerationResult + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + from loopy.preprocess import prepare_for_caching + if CACHING_ENABLED: - code_gen_cache.store_if_not_present(input_kernel, codegen_result) + input_program = prepare_for_caching(program) + try: + result = code_gen_cache[input_program] + logger.debug(f"Program with entrypoints {program.entrypoints}:" + " code generation cache hit") + return result + except KeyError: + pass - return codegen_result + # }}} + + if isinstance(program, LoopKernel): + program = make_program(program) + + from loopy.kernel import KernelState + if program.state < KernelState.PREPROCESSED: + # Note that we cannot have preprocessing separately for everyone. + # Since, now the preprocessing of each one depends on the other. + # So we check if any one of the callable kernels are not preprocesses + # then, we have to do the preprocessing of every other kernel. + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + new_callables = {} + + for name, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + from loopy.schedule import get_one_scheduled_kernel + knl = clbl.subkernel + if knl.schedule is None: + knl = get_one_scheduled_kernel( + knl, program.callables_table) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + program = program.copy(callables_table=new_callables) + + program = diverge_callee_entrypoints(program) + + host_programs = OrderedDict() + device_programs = [] + device_preambles = [] + callee_fdecls = [] + implemented_data_infos = OrderedDict() + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + #FIXME: + # 1. Diverge the kernels which are both entrypoint and callees at this + # point. By diverge we should rename the callees in kernels. + # 2. 
Then pass the callee versions by saying is_entrypoint=False + cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.callables_table, program.target, func_id in + program.entrypoints) + if func_id in program.entrypoints: + assert len(cgr.host_programs) == 1 + host_programs[func_id] = cgr.host_programs[func_id] + implemented_data_infos[func_id] = cgr.implemented_data_info + else: + # FIXME: This assertion should be valid + # assert cgr.host_programs == [] + assert len(cgr.device_programs) == 1 + #FIXME: + # if isinstance(callee_prog_ast, Collection): + # for entry in callee_prog_ast.contents: + # if isinstance(entry, FunctionBody): + # callee_fdecls.append(entry.fdecl) + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) + + device_programs.extend(cgr.device_programs) + device_preambles.extend(cgr.device_preambles) + + device_preambles.extend(list(in_knl_callable.generate_preambles( + program.target))) + + # adding the callee fdecls to the device_programs + device_programs = ([device_programs[0].copy( + ast=program.target.get_device_ast_builder().ast_module.Collection( + callee_fdecls+[device_programs[0].ast]))] + + device_programs[1:]) + cgr = CodeGenerationResult( + host_programs=host_programs, + device_programs=device_programs, + device_preambles=device_preambles, + implemented_data_infos=implemented_data_infos) + + if CACHING_ENABLED: + code_gen_cache.store_if_not_present(input_program, cgr) + + return cgr def generate_code(kernel, device=None): @@ -572,8 +763,13 @@ def generate_code(kernel, device=None): if len(codegen_result.device_programs) > 1: raise LoopyError("kernel passed to generate_code yielded multiple " "device programs. Use generate_code_v2.") + if len(codegen_result.host_programs) > 1: + raise LoopyError("kernel passed to generate_code yielded multiple " + "host programs. Use generate_code_v2.") + + _, implemented_data_info = codegen_result.implemented_data_infos.popitem() - return codegen_result.device_code(), codegen_result.implemented_data_info + return codegen_result.device_code(), implemented_data_info # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index f7e953d9481aee705b785406462725ea25d860fe..f65c397424b7a498ec979369f6d9ed56e9c06aeb 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
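# Illustrative sketch (not part of the patch): the assembly loop above in
# miniature. Every callable kernel is code-generated, but only entrypoints
# contribute a host program; forward declarations of the callees are
# hoisted in front of the first device program. Plain strings stand in
# for the ASTs here.

from collections import OrderedDict

generated = OrderedDict([      # func_id -> (host ast, device ast, fdecl)
    ("main", ("host_main(){...}", "kernel_main(){...}", None)),
    ("helper", (None, "helper(){...}", "void helper();")),
])
entrypoints = {"main"}

host_programs = OrderedDict()
device_programs = []
callee_fdecls = []

for func_id, (host, device, fdecl) in generated.items():
    if func_id in entrypoints:
        host_programs[func_id] = host
    else:
        callee_fdecls.append(fdecl)
    device_programs.append(device)

device_programs[0] = "\n".join(callee_fdecls + [device_programs[0]])
print("\n\n".join(device_programs + list(host_programs.values())))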
""" +from collections import OrderedDict from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -89,17 +90,21 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) + if codegen_state.is_entrypoint: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for non-entrypoint kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) @@ -149,7 +154,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( - host_program=None, + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index d149eb95ab7c115b38cc1b819b1c24f7b4597170..c343483f0c60497f43cc9fde2981b3e5598b00b5 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -251,7 +251,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 685df8fdec9ef0ea9e45223ceae563d943a69d79..358088922a31fe4a33d5d060c5b7194514d518c0 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -21,6 +21,7 @@ THE SOFTWARE. """ from pytools import ImmutableRecord +from collections import OrderedDict def process_preambles(preambles): @@ -76,7 +77,11 @@ class GeneratedProgram(ImmutableRecord): class CodeGenerationResult(ImmutableRecord): """ - .. attribute:: host_program + .. attribute:: host_programs + + A mapping from entrypoints of a translation unit to instances of + :class:`GeneratedProgram` intended to be run on host. + .. attribute:: device_programs A list of :class:`GeneratedProgram` instances @@ -97,7 +102,7 @@ class CodeGenerationResult(ImmutableRecord): .. attribute:: implemented_data_info a list of :class:`loopy.codegen.ImplementedDataInfo` objects. - Only added at the very end of code generation. 
+ Only added at the very end of code generation """ @staticmethod @@ -109,12 +114,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { - "host_program": None, "device_programs": [prg], + "host_programs": OrderedDict() } else: kwargs = { - "host_program": prg, + "host_programs": OrderedDict({codegen_state.kernel.name: prg}), "device_programs": [], } @@ -128,8 +133,9 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) - + - str(self.host_program.ast)) + + "\n" + + "\n\n".join(str(hp.ast) + for hp in self.host_programs.values())) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -151,7 +157,8 @@ class CodeGenerationResult(ImmutableRecord): + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" - + str(self.host_program.ast)) + + "\n\n".join(str(hp.ast) for hp in + self.host_programs.values())) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: @@ -160,7 +167,11 @@ class CodeGenerationResult(ImmutableRecord): else: result = None else: - result = self.host_program + if self.host_programs: + host_programs = self.host_programs.copy() + _, result = host_programs.popitem() + else: + result = None if result is None: ast = codegen_state.ast_builder.ast_block_class([]) @@ -184,7 +195,15 @@ class CodeGenerationResult(ImmutableRecord): else: assert program.name == codegen_state.gen_program_name assert not program.is_device_program - return self.copy(host_program=program) + host_programs = self.host_programs.copy() + if host_programs: + e, _ = host_programs.popitem() + assert codegen_state.kernel.name == e + host_programs[e] = program + else: + host_programs[codegen_state.kernel.name] = program + return self.copy( + host_programs=host_programs) def current_ast(self, codegen_state): return self.current_program(codegen_state).ast @@ -205,7 +224,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): if not elements: return CodeGenerationResult( - host_program=None, + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) @@ -302,28 +321,35 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) + if (codegen_state.is_generating_device_code) or ( + codegen_state.is_entrypoint): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - 
body_ast=ast_builder.process_ast(body_ast))) + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_programs=OrderedDict()) return codegen_result # }}} + +# vim: foldmethod=marker diff --git a/loopy/compiled.py b/loopy/compiled.py index f9313c6c95612ddba6566d7c8175d998e8312147..0fa18eacbc3a16059e06c33202c91f89cc39ef64 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -31,11 +31,11 @@ class CompiledKernel(PyOpenCLKernelExecutor): """ .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, kernel, entrypoint): from warnings import warn warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super().__init__(context, kernel) + super().__init__(context, kernel, entrypoint) # }}} diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 00dc837e16ad2a414a13c1ceaf4f36f3f3fb3049..4ad7cd21c48f7abd396afb977267e30cb9f2d501 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -20,7 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + from loopy.diagnostic import LoopyError +from pytools import ProcessLogger def c_preprocess(source, defines=None, filename=None, include_paths=None): @@ -152,8 +156,9 @@ def parse_transformed_fortran(source, free_form=True, strict=True, :func:`parse_fortran`. * ``FILENAME``: the file name of the code being processed - The transform code must define ``RESULT``, conventionally a list of - kernels, which is returned from this function unmodified. + The transform code must define ``RESULT``, conventionally a list of kernels + or a :class:`loopy.Program`, which is returned from this function + unmodified. An example of *source* may look as follows:: @@ -234,11 +239,64 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] -def parse_fortran(source, filename="", free_form=True, strict=True, - seq_dependencies=None, auto_dependencies=None, target=None): +def _add_assignees_to_calls(knl, all_kernels): """ - :returns: a list of :class:`loopy.LoopKernel` objects + Returns a copy of *knl* coming from the fortran parser adjusted to the + loopy specification that written variables of a call must appear in the + assignee. + + :param knl: An instance of :class:`loopy.LoopKernel`, which have incorrect + calls to the kernels in *all_kernels* by stuffing both the input and + output arguments into parameters. + + :param all_kernels: An instance of :class:`list` of loopy kernels which + may be called by *kernel*. 
""" + new_insns = [] + subroutine_dict = {kernel.name: kernel for kernel in all_kernels} + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + +def parse_fortran(source, filename="", free_form=None, strict=None, + seq_dependencies=None, auto_dependencies=None, target=None): + + parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) if seq_dependencies is not None and auto_dependencies is not None: raise TypeError( @@ -251,6 +309,10 @@ def parse_fortran(source, filename="", free_form=True, strict=True, if seq_dependencies is None: seq_dependencies = True + if free_form is None: + free_form = True + if strict is None: + strict = True import logging console = logging.StreamHandler() @@ -271,7 +333,23 @@ def parse_fortran(source, filename="", free_form=True, strict=True, f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) - return f2loopy.make_kernels(seq_dependencies=seq_dependencies) + kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + + from loopy.transform.callable import merge + prog = merge(kernels) + all_kernels = [clbl.subkernel + for clbl in prog.callables_table.values()] + + for knl in all_kernels: + prog.with_kernel(_add_assignees_to_calls(knl, all_kernels)) + + if len(all_kernels) == 1: + # guesssing in the case of only one function + prog = prog.with_entrypoints(all_kernels[0].name) + + parse_plog.done() + + return prog # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index 354a769a0f4b4762cc3e39befa8fb27723be5e72..cc93e914d0470c423812b69913a7185dca9c7b67 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -42,6 +42,25 @@ _and = intern("and") _or = intern("or") +def tuple_to_complex_literal(expr): + if len(expr) != 2: + raise TranslationError("complex literals must have " + "two entries") + + r, i = expr + + r = np.array(r)[()] + i = np.array(i)[()] + + dtype = (r.dtype.type(0) + i.dtype.type(0)) + if dtype == np.float32: + dtype = np.complex64 + else: + dtype = np.complex128 + + return dtype(float(r) + float(i)*1j) + + # {{{ expression parser class FortranExpressionParser(ExpressionParserBase): @@ -176,24 +195,31 @@ class FortranExpressionParser(ExpressionParserBase): left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, 
left_exp) - if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: - # this must be a complex literal - if len(left_exp) != 2: - raise TranslationError("complex literals must have " - "two entries") + return left_exp, did_something - r, i = left_exp + def parse_expression(self, pstate, min_precedence=0): + left_exp = self.parse_prefix(pstate) - dtype = (r.dtype.type(0) + i.dtype.type(0)) - if dtype == np.float32: - dtype = np.complex64 - else: - dtype = np.complex128 + did_something = True + while did_something: + did_something = False + if pstate.is_at_end(): + return left_exp - left_exp = dtype(float(r) + float(i)*1j) + result = self.parse_postfix( + pstate, min_precedence, left_exp) + left_exp, did_something = result - return left_exp, did_something + from pymbolic.parser import FinalizedTuple + if isinstance(left_exp, FinalizedTuple): + # View all tuples that survive parsing as complex literals + # "FinalizedTuple" indicates that this tuple was enclosed + # in parens. + return tuple_to_complex_literal(left_exp) + + return left_exp # }}} + # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 78eddfb549c4ebabbf933abd7832a449a44dbeec..22e532c6e4bf44a4989ac6ff90d75b5c939aab3c 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -35,12 +35,14 @@ from islpy import dim_type from loopy.symbolic import IdentityMapper from loopy.diagnostic import LoopyError from loopy.kernel.instruction import LegacyStringInstructionTag -from pymbolic.primitives import Wildcard +from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter -class SubscriptIndexBaseShifter(IdentityMapper): +class SubscriptIndexAdjuster(IdentityMapper): + """Adjust base indices of subscripts and lengths of slices.""" + def __init__(self, scope): self.scope = scope @@ -58,21 +60,63 @@ class SubscriptIndexBaseShifter(IdentityMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - subscript = list(subscript) - if len(dims) != len(subscript): raise TranslationError("inconsistent number of indices " "to '%s'" % name) + new_subscript = [] for i in range(len(dims)): if len(dims[i]) == 2: - # has a base index - subscript[i] -= dims[i][0] + # has an explicit base index + base_index, end_index = dims[i] elif len(dims[i]) == 1: - # base index is 1 implicitly - subscript[i] -= 1 + base_index = 1 + end_index, = dims[i] + + sub_i = subscript[i] + if isinstance(sub_i, Slice): + start = sub_i.start + if start is None: + start = base_index + + step = sub_i.step + if step is None: + step = 1 + + stop = sub_i.stop + if stop is None: + stop = end_index + + if step == 1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + elif step == -1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index - 1, + + step + )) + + else: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") + + else: + sub_i = sub_i - base_index - return expr.aggregate[self.rec(tuple(subscript))] + new_subscript.append(sub_i) + + return expr.aggregate[self.rec(tuple(new_subscript))] # }}} @@ -83,9 +127,6 @@ class Scope: def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name - # map name to data - self.data_statements = {} - # map first letter to type self.implicit_types = {} @@ -96,7 +137,7 @@ class Scope: 
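# Illustrative sketch (not part of the patch): the dtype-promotion rule
# used by tuple_to_complex_literal, standalone. A parenthesized pair such
# as (1., 2.) becomes complex64 only if both halves are single precision,
# and complex128 otherwise.

import numpy as np


def to_complex_literal(r, i):
    r = np.array(r)[()]
    i = np.array(i)[()]
    if (r.dtype.type(0) + i.dtype.type(0)).dtype == np.float32:
        return np.complex64(float(r) + float(i)*1j)
    return np.complex128(float(r) + float(i)*1j)


print(type(to_complex_literal(np.float32(1), np.float32(2))))  # complex64
print(type(to_complex_literal(1.0, 2.0)))                      # complex128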
self.type_map = {} # map name to data - self.data = {} + self.data_map = {} self.arg_names = arg_names @@ -185,7 +226,7 @@ class Scope: expr = submap(expr) - subshift = SubscriptIndexBaseShifter(self) + subshift = SubscriptIndexAdjuster(self) expr = subshift(expr) return expr @@ -216,11 +257,16 @@ class F2LoopyTranslator(FTreeWalkerBase): self.block_nest = [] + def add_instruction(self, insn): + scope = self.scope_stack[-1] + + scope.previous_instruction_id = insn.id + scope.instructions.append(insn) + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = intern("insn%d" % self.insn_id_counter) - self.insn_id_counter += 1 + new_id = self.get_insn_id() from loopy.kernel.data import Assignment insn = Assignment( @@ -231,8 +277,13 @@ class F2LoopyTranslator(FTreeWalkerBase): predicates=frozenset(self.conditions), tags=tuple(self.instruction_tags)) - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_instruction(insn) + + def get_insn_id(self): + new_id = intern("insn%d" % self.insn_id_counter) + self.insn_id_counter += 1 + + return new_id # {{{ map_XXX functions @@ -326,7 +377,8 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node, node.entity_decls): + for name, shape, initializer in self.parse_dimension_specs( + node, node.entity_decls): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -335,6 +387,9 @@ class F2LoopyTranslator(FTreeWalkerBase): assert name not in scope.type_map scope.type_map[name] = tp + assert name not in scope.data_map + scope.data_map[name] = initializer + return [] map_Logical = map_type_decl # noqa: N815 @@ -346,7 +401,10 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node, node.items): + for name, shape, initializer in self.parse_dimension_specs(node, node.items): + if initializer is not None: + raise LoopyError("initializer in dimension statement") + if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -435,7 +493,23 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): - raise NotImplementedError("call") + scope = self.scope_stack[-1] + + new_id = self.get_insn_id() + + from pymbolic import var + + from loopy.kernel.data import CallInstruction + insn = CallInstruction( + (), var(node.designator)(*(scope.process_expression_for_loopy( + self.parse_expr(node, item)) for item in node.items)), + within_inames=frozenset( + scope.active_loopy_inames), + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + self.add_instruction(insn) def map_Return(self, node): raise NotImplementedError("return") @@ -443,11 +517,6 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_ArithmeticIf(self, node): raise NotImplementedError("arithmetic-if") - def map_If(self, node): - raise NotImplementedError("if") - # node.expr - # node.content[0] - def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] @@ -474,6 +543,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_expr) + def map_If(self, node): + self.realize_conditional(node, None) + + for c in node.content: + self.rec(c) + + self.conditions_data.pop() + self.conditions.pop() + def map_IfThen(self, node): self.block_nest.append("if") self.realize_conditional(node, None) @@ -672,6 
+750,10 @@ class F2LoopyTranslator(FTreeWalkerBase):
         for arg_name in sub.arg_names:
             dims = sub.dim_map.get(arg_name)
 
+            if sub.data_map.get(arg_name) is not None:
+                raise NotImplementedError(
+                        "initializer for argument %s" % arg_name)
+
             if dims is not None:
                 # default order is set to "F" in kernel creation below
                 kernel_data.append(
@@ -697,15 +779,22 @@
             if sub.implicit_types is None and dtype is None:
                 continue
 
+            kwargs = {}
+            if sub.data_map.get(var_name) is not None:
+                kwargs["read_only"] = True
+                kwargs["address_space"] = lp.AddressSpace.PRIVATE
+                kwargs["initializer"] = np.array(
+                        sub.data_map[var_name], dtype=dtype)
+
             kernel_data.append(
                     lp.TemporaryVariable(
                         var_name, dtype=dtype,
-                        shape=sub.get_loopy_shape(var_name)))
+                        shape=sub.get_loopy_shape(var_name),
+                        **kwargs))
 
         # }}}
 
-        from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-        knl = lp.make_kernel(
+        knl = lp.make_function(
                 sub.index_sets,
                 sub.instructions,
                 kernel_data,
@@ -714,11 +803,10 @@
                 index_dtype=self.index_dtype,
                 target=self.target,
                 seq_dependencies=seq_dependencies,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION
                 )
 
-        from loopy.loop import fuse_loop_domains
-        knl = fuse_loop_domains(knl)
+        from loopy.loop import merge_loop_domains
+        knl = merge_loop_domains(knl)
 
         knl = lp.fold_constants(knl)
         result.append(knl)
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index c33578dc844d1a77b084d8cf83cb5009cccc489d..0dc426572f69b7a8ce16dbc97a70f874f17954c4 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -52,7 +52,9 @@ class FTreeWalkerBase:
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)\s*"
-            r"(\((?P<shape>[-+*/0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*/0-9:a-zA-Z, \t]+)\))?"
+            r"(\s*=\s*(?P<initializer>.+))?"
+ "$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): @@ -75,7 +77,31 @@ class FTreeWalkerBase: else: shape = None - yield name, shape + init_str = groups["initializer"] + if init_str: + init_str = init_str.replace("(/", "[") + init_str = init_str.replace("/)", "]") + init_expr = self.parse_expr(node, init_str) + + from numbers import Number + if isinstance(init_expr, Number): + initializer = init_expr + elif isinstance(init_expr, list): + for i, item in enumerate(init_expr): + if not isinstance(item, Number): + raise LoopyError("unexpected type of " + "item %d in initializer: %s" + % (i+1, type(init_expr).__name__)) + initializer = init_expr + + else: + raise LoopyError("unexpected type of initializer: %s" + % type(init_expr).__name__) + + else: + initializer = None + + yield name, shape, initializer def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index 7f9177e0ef8430cc450cb462641b12ed1a9f9b28..a469b46489786b39516ccda58a20130de4d0e7ea 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -8,9 +8,7 @@ class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): result = lp.parse_fortran(cell) - - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result @cell_magic def transformed_fortran_kernel(self, line, cell): @@ -18,8 +16,7 @@ class LoopyMagics(Magics): cell, transform_code_context=self.shell.user_ns) - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result def load_ipython_extension(ip): diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index ec0be6f5c8bff088a6e5f1e661bf0048c3a79c83..8ed4d3d437ccb5216f2464d999bd5b24e530bae8 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -24,7 +24,7 @@ THE SOFTWARE. """ -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type @@ -60,7 +60,30 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop): +def make_slab(space, iname, start, stop, step=1): + """ + Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the + constraint ``start <= step*iname < stop``. + + :arg space: An instance of :class:`islpy._isl.Space`. + + :arg iname: + Either an instance of :class:`str` as a name of the ``iname`` or a + tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. + + :arg start: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the lower bound of + ``step*iname``(inclusive). + + :arg stop: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the upper bound of + ``step*iname``. + + :arg step: + An instance of :class:`int`. 
+ """ zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -89,13 +112,25 @@ def make_slab(space, iname, start, stop): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - result = (isl.BasicSet.universe(space) - # start <= iname - .add_constraint(isl.Constraint.inequality_from_aff( - iname_aff - start)) - # iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - iname_aff))) + if step > 0: + result = (isl.BasicSet.universe(space) + # start <= step*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff - start)) + # step*iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + stop-1 - step*iname_aff))) + elif step < 0: + result = (isl.BasicSet.universe(space) + # start >= (-step)*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff + start)) + # (-step)*iname > stop + .add_constraint(isl.Constraint.inequality_from_aff( + -stop-1 - step*iname_aff))) + else: + # step = 0 + raise LoopyError("0 step not allowed in make_slab.") return result @@ -427,11 +462,16 @@ def boxify(cache_manager, domain, box_inames, context): def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + from loopy.symbolic import aff_to_expr, guarded_aff_from_expr, get_dependencies + from loopy.diagnostic import ExpressionToAffineConversionError + deps = get_dependencies(expr) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), - expr)) + try: + return aff_to_expr(guarded_aff_from_expr( + isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), + expr)) + except ExpressionToAffineConversionError: + return expr def project_out(set, inames): @@ -578,7 +618,7 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): def set_dim_name(obj, dt, pos, name): assert isinstance(name, str) - if isinstance(obj, isl.PwQPolynomial): + if isinstance(obj, (isl.PwQPolynomial, isl.BasicSet)): return obj.set_dim_name(dt, pos, name) elif isinstance(obj, isl.PwAff): # work around missing isl_pw_aff_set_dim_name for now. diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f563c3d8ce995771e6a2ab9c31f776c018616761..e6c05c8782b13c6b861843f20d785baba3d7937b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -34,11 +34,8 @@ import re from pytools import UniqueNameGenerator, generate_unique_names, natsorted -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname from warnings import warn @@ -108,8 +105,9 @@ class _deprecated_KernelState_SCHEDULED: # noqa class KernelState: # noqa INITIAL = 0 - PREPROCESSED = 1 - LINEARIZED = 2 + CALLS_RESOLVED = 1 + PREPROCESSED = 2 + LINEARIZED = 3 @_deprecated_KernelState_SCHEDULED def SCHEDULED(): # pylint:disable=no-method-argument @@ -163,8 +161,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances - representing the :ref:`domain-tree`. + a list of :class:`islpy.BasicSet` instances representing the + :ref:`domain-tree`. .. attribute:: instructions @@ -193,7 +191,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): :class:`loopy.TemporaryVariable` instances. - .. attribute:: function_manglers .. 
attribute:: symbol_manglers .. attribute:: substitutions @@ -259,7 +256,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): inames=None, iname_to_tags=None, substitutions=None, - function_manglers=None, symbol_manglers=[], iname_slab_increments=None, @@ -268,7 +264,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, @@ -296,16 +292,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): temporary_variables = {} if substitutions is None: substitutions = {} - if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] if iname_slab_increments is None: iname_slab_increments = {} @@ -338,6 +324,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): name: inames.get(name, Iname(name, frozenset())) for name in _get_inames_from_domains(domains)} + if index_dtype is None: + index_dtype = np.int32 + # }}} # {{{ process assumptions @@ -372,6 +361,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, + KernelState.CALLS_RESOLVED, KernelState.PREPROCESSED, KernelState.LINEARIZED, ]: @@ -415,7 +405,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -429,51 +418,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling - - def mangle_function(self, identifier, arg_dtypes, ast_builder=None): - if ast_builder is None: - ast_builder = self.target.get_device_ast_builder() - - manglers = ast_builder.function_manglers() + self.function_manglers - - for mangler in manglers: - mangle_result = mangler(self, identifier, arg_dtypes) - if mangle_result is not None: - from loopy.kernel.data import CallMangleInfo - if isinstance(mangle_result, CallMangleInfo): - assert len(mangle_result.arg_dtypes) == len(arg_dtypes) - return mangle_result - - assert isinstance(mangle_result, tuple) - - from warnings import warn - warn("'%s' returned a tuple instead of a CallMangleInfo instance. " - "This is deprecated." 
% mangler.__name__, - DeprecationWarning) - - if len(mangle_result) == 2: - result_dtype, target_name = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=None) - - elif len(mangle_result) == 3: - result_dtype, target_name, actual_arg_dtypes = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=actual_arg_dtypes) - - else: - raise ValueError("unexpected size of tuple returned by '%s'" - % mangler.__name__) - - return None - - # }}} - # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): @@ -547,6 +491,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): except KeyError: pass + if name in self.all_inames(): + from loopy import TemporaryVariable + return TemporaryVariable( + name=name, + dtype=self.index_dtype, + shape=()) + + try: + dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), + name) + from loopy import ValueArg + return ValueArg(name, dtype) + except TypeError: + pass + raise ValueError("nothing known about variable '%s'" % name) @property @@ -1087,21 +1046,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. - - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): + """ + Returns a tuple of (global_sizes, local_sizes), where global_sizes, + local_sizes are the grid sizes accommodating all of *insn_ids*. The grid + sizes are a dict from the axis index to the corresponding grid size. 
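# Illustrative sketch (not part of the patch): the per-axis merge of
# caller and callee grid sizes, with plain integers standing in for the
# PwAff sizes that the real code combines with .max().

def merge_grid_sizes(caller, callee):
    merged = dict(caller)
    for axis, size in callee.items():
        merged[axis] = max(merged.get(axis, size), size)
    return merged


caller_local = {0: 16}          # caller uses l.0 = 16
callee_local = {0: 32, 1: 4}    # callee needs l.0 = 32 and l.1 = 4
print(merge_grid_sizes(caller_local, callee_local))   # {0: 32, 1: 4}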
""" - - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) - all_inames_by_insns = set() for insn_id in insn_ids: all_inames_by_insns |= self.insn_inames(insn_id) @@ -1112,9 +1063,49 @@ class LoopKernel(ImmutableRecordWithoutPickling): % (", ".join(sorted(all_inames_by_insns)), ", ".join(sorted(self.all_inames())))) + # {{{ include grid constraints due to callees + global_sizes = {} local_sizes = {} + from loopy.kernel.data import ValueArg + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.isl_helpers import subst_into_pwaff + from loopy.symbolic import ResolvedFunction + + for insn in self.instructions: + if isinstance(insn, CallInstruction) and isinstance( + insn.expression.function, ResolvedFunction): + clbl = callables_table[insn.expression.function.name] + if isinstance(clbl, CallableKernel): + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(insn.expression.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + gsize, lsize = ( + clbl.subkernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id + for insn in clbl.subkernel.instructions), + callables_table, ignore_auto)) + + for tgt_dict, tgt_size in [(global_sizes, gsize), + (local_sizes, lsize)]: + + for iaxis, size in tgt_size.items(): + size = subst_into_pwaff(self.assumptions.space, + size, subst_dict) + if iaxis in tgt_dict: + tgt_dict[iaxis] = tgt_dict[iaxis].max(size) + else: + tgt_dict[iaxis] = size + + # }}} + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1156,6 +1147,32 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False, return_dict=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + + if return_dict: + return global_sizes, local_sizes + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1185,7 +1202,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + @memoize_method + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + callables_table, ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1196,7 +1215,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, callables_table, ignore_auto, return_dict) + + if return_dict: + def dict_to_exprs(d): + from loopy.symbolic import pw_aff_to_expr + return {k: pw_aff_to_expr(v, int_ok=True) + for k, v in d.items()} + + return dict_to_exprs(grid_size), dict_to_exprs(group_size) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1204,7 +1231,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False, + return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1212,18 +1240,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, callables_table, + ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. *global_size* and *local_size* are :mod:`pymbolic` expressions """ - return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) # }}} @@ -1454,14 +1483,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ Execute the :class:`LoopKernel`. """ - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.program import make_program + program = make_program(self) + return program(*args, **kwargs) # }}} @@ -1558,18 +1584,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): # resolve hash conflicts. "preamble_generators", - "function_manglers", "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash @memoize_method def __hash__(self): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 9fd166ab8f15bdc97006c94c7d03977b64c08292..8fdcb1386ecd2873d8f511095cf1914e0dff292b 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -88,6 +88,9 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): :class:`ComputedStrideArrayDimTag` instances may occur. 
""" + def depends_on(self): + raise NotImplementedError() + class FixedStrideArrayDimTag(_StrideArrayDimTagBase): """An arg dimension implementation tag for a fixed (potentially @@ -145,6 +148,14 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase): return self.copy(stride=mapper(self.stride)) + def depends_on(self): + from loopy.kernel.data import auto + from loopy.symbolic import DependencyMapper + if self.stride is auto: + return frozenset() + + return DependencyMapper(composite_leaves=auto)(self.stride) + class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): """ @@ -179,6 +190,9 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -190,6 +204,9 @@ class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class VectorArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -201,6 +218,9 @@ class VectorArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + NESTING_LEVEL_RE = re.compile(r"^N([-0-9]+)(?::(.*)|)$") PADDED_STRIDE_TAG_RE = re.compile(r"^([a-zA-Z]*)\(pad=(.*)\)$") @@ -864,6 +884,7 @@ class ArrayBase(ImmutableRecord, Taggable): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, tags=tags, **kwargs) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a9129cd9317366fd10a9c1d44cf2410e90ba554d..8a2e9cde1b670936b6545a88ff89c412463f78fd 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -23,16 +23,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript, Call from loopy.tools import intern_frozenset_of_ids, Optional -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace, ValueArg) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -530,9 +532,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1080,6 +1084,9 @@ def parse_domains(domains, defines): result.append(dom) + if result == []: + result = [isl.BasicSet("{:}")] + return result # }}} @@ -1168,8 +1175,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() import loopy as lp - - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg if arg_name in self.all_params: return ValueArg(arg_name) @@ -1720,7 +1726,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions from loopy.match import MatchExpressionBase new_deps = [] @@ -1814,6 +1820,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -1882,9 +1889,211 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ slice to sub array ref + +def normalize_slice_params(slice, dimension_length): + """ + Returns the normalized slice parameters ``(start, stop, step)``. + + :arg slice: An instance of :class:`pymbolic.primitives.Slice`. + :arg dimension_length: Length of the axis being sliced. + """ + from pymbolic.primitives import Slice + assert isinstance(slice, Slice) + start, stop, step = slice.start, slice.stop, slice.step + + # {{{ defaulting parameters + + if step is None: + step = 1 + + if step == 0: + raise LoopyError("Slice cannot have 0 step size.") + + if start is None: + if step > 0: + start = 0 + else: + start = dimension_length-1 + + if stop is None: + if step > 0: + stop = dimension_length + else: + stop = -1 + + # }}} + + return start, stop, step + + +class SliceToInameReplacer(IdentityMapper): + """ + Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. + + .. attribute:: var_name_gen + + Variable name generator, in order to generate unique inames within the + kernel domain. + + .. attribute:: knl + + An instance of :class:`loopy.LoopKernel` + + .. 
attribute:: subarray_ref_bounds
+
+        A :class:`list` (one entry for each :class:`SubArrayRef` to be created)
+        of :class:`dict` instances to store the slices encountered in the
+        expressions as a mapping from ``iname`` to a tuple of ``(start, stop,
+        step)``, which describes the boxy (i.e. affine) constraints imposed on
+        the ``iname`` by the corresponding slice notation it is intended to
+        replace.
+    """
+    def __init__(self, knl):
+        self.subarray_ref_bounds = []
+        self.knl = knl
+        self.var_name_gen = knl.get_var_name_generator()
+
+    def map_subscript(self, expr):
+        subscript_iname_bounds = {}
+
+        new_index = []
+        swept_inames = []
+        for i, index in enumerate(expr.index_tuple):
+            if isinstance(index, Slice):
+                unique_var_name = self.var_name_gen(based_on="i")
+                if expr.aggregate.name in self.knl.arg_dict:
+                    shape = self.knl.arg_dict[expr.aggregate.name].shape
+                else:
+                    assert expr.aggregate.name in self.knl.temporary_variables
+                    shape = self.knl.temporary_variables[
+                            expr.aggregate.name].shape
+                if shape is None or shape[i] is None:
+                    raise LoopyError("Slice notation is only supported for "
+                            "variables whose shapes are known at creation time "
+                            "-- maybe add the shape for '{}'.".format(
+                                expr.aggregate.name))
+
+                domain_length = shape[i]
+                start, stop, step = normalize_slice_params(index, domain_length)
+                subscript_iname_bounds[unique_var_name] = (start, stop, step)
+                new_index.append(start+step*Variable(unique_var_name))
+                swept_inames.append(Variable(unique_var_name))
+            else:
+                new_index.append(index)
+
+        if swept_inames:
+            self.subarray_ref_bounds.append(subscript_iname_bounds)
+            result = SubArrayRef(tuple(swept_inames), Subscript(
+                self.rec(expr.aggregate),
+                self.rec(tuple(new_index))))
+        else:
+            result = super().map_subscript(expr)
+
+        return result
+
+    def map_call(self, expr):
+        from pymbolic.primitives import CallWithKwargs
+        new_expr = self.rec(CallWithKwargs(expr.function, expr.parameters, {}))
+        return Call(new_expr.function, new_expr.parameters)
+
+    def map_call_with_kwargs(self, expr):
+        def _convert_array_to_slices(arg):
+            # FIXME: We do not support something like A[1] pointing to the
+            # second row of a 3 x 3 array 'A'.
+            if isinstance(arg, Variable):
+                from loopy.kernel.data import auto
+                if arg.name in self.knl.temporary_variables:
+                    if self.knl.temporary_variables[arg.name].shape in [
+                            auto, None]:
+                        # do not convert arrays with unknown shapes to slices.
+                        # (If an array of unknown shape was passed in error,
+                        # this will be caught and raised during preprocessing.)
+                        array_arg_shape = ()
+                    else:
+                        array_arg_shape = (
+                                self.knl.temporary_variables[arg.name].shape)
+                elif arg.name in self.knl.arg_dict:
+                    if isinstance(self.knl.arg_dict[arg.name], ValueArg):
+                        array_arg_shape = ()
+                    else:
+                        if self.knl.arg_dict[arg.name].shape in [
+                                auto, None]:
+                            # do not convert arrays with unknown shapes to slices.
+                            # (If an array of unknown shape was passed in error,
+                            # this will be caught and raised during
+                            # preprocessing.)
+ array_arg_shape = () + else: + array_arg_shape = ( + self.knl.arg_dict[arg.name].shape) + else: + assert arg.name in self.knl.all_inames() + array_arg_shape = () + + if array_arg_shape != (): + return Subscript(arg, tuple(Slice(()) + for _ in array_arg_shape)) + return arg + + from pymbolic.primitives import CallWithKwargs + return CallWithKwargs(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) + for par in expr.parameters), + {kw: self.rec(_convert_array_to_slices(par)) + for kw, par in expr.kw_parameters.items()}) + + def get_iname_domain_as_isl_set(self): + """ + Returns the extra domain constraints imposed by the slice inames, + recorded in :attr:`iname_domains`. + """ + subarray_ref_domains = [] + for sar_bounds in self.subarray_ref_bounds: + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(sar_bounds.keys())) + from loopy.symbolic import get_dependencies + args_as_params_for_domains = set() + for slice_ in sar_bounds.values(): + args_as_params_for_domains |= get_dependencies(slice_) + + space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_name(dim_type.param, i, arg) + + iname_set = isl.BasicSet.universe(space) + + from loopy.isl_helpers import make_slab + for iname, (start, stop, step) in sar_bounds.items(): + iname_set = iname_set & make_slab(space, iname, start, stop, step) + + subarray_ref_domains.append(iname_set) + + return subarray_ref_domains + + +def realize_slices_array_inputs_as_sub_array_refs(kernel): + """ + Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` + encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. + """ + slice_replacer = SliceToInameReplacer(kernel) + new_insns = [insn.with_transformed_expressions(slice_replacer) + for insn in kernel.instructions] + + return kernel.copy( + domains=( + kernel.domains + + slice_replacer.get_iname_domain_as_isl_set()), + instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level -def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): +def make_function(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. :arg domains: @@ -2047,7 +2256,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # This *is* gross. But it seems like the right thing interface-wise. import inspect - caller_globals = inspect.currentframe().f_back.f_globals + if inspect.currentframe().f_back.f_code.co_name == "make_kernel": + # if caller is "make_kernel", read globals from make_kernel's caller + caller_globals = inspect.currentframe().f_back.f_back.f_globals + else: + caller_globals = inspect.currentframe().f_back.f_globals for ver_sym in LANGUAGE_VERSION_SYMBOLS: try: @@ -2064,7 +2277,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import ( MOST_RECENT_LANGUAGE_VERSION, FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " + warn("'lang_version' was not passed to make_function(). " "To avoid this warning, pass " "lang_version={ver} in this invocation. 
" "(Or say 'from loopy.version import " @@ -2180,6 +2393,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + + # convert slices to iname domains + knl = realize_slices_array_inputs_as_sub_array_refs(knl) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2217,15 +2434,25 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_args_are_input_output + knl = infer_args_are_input_output(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.program import make_program + return make_program(knl) + + +def make_kernel(*args, **kwargs): + tunit = make_function(*args, **kwargs) + name, = [name for name in tunit.callables_table] + return tunit.with_entrypoints(name) + - return knl +make_kernel.__doc__ = make_function.__doc__ # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 620211cf29b464e297119ede597bb1abadaff193..377d13e61218df92ee4236988ab6d31af9a0289e 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -363,6 +363,8 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) ImmutableRecord.__init__(self, **kwargs) @@ -375,21 +377,39 @@ class ArrayArg(ArrayBase, KernelArgument): An attribute of :class:`AddressSpace` defining the address space in which the array resides. - .. attribute:: is_output_only + .. attribute:: is_output - An instance of :class:`bool`. If set to *True*, recorded to be - returned from the kernel. + An instance of :class:`bool`. If set to *True*, the array is used to + return information to the caller. If set to *False*, the callee does not + write to the array during a call. + + .. attribute:: is_input + + An instance of :class:`bool`. If set to *True*, expected to be provided + by the caller. If *False*, the callee does not depend on the array + at kernel entry. """) allowed_extra_kwargs = [ "address_space", - "is_output_only", + "is_output", + "is_input", "tags"] def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + + is_output_only = kwargs.pop("is_output_only", None) + if is_output_only is not None: + warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'" + " instead.", DeprecationWarning, stacklevel=2) + kwargs["is_output"] = is_output_only + kwargs["is_input"] = not is_output_only + else: + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) + super().__init__(*args, **kwargs) min_target_axes = 0 @@ -416,7 +436,8 @@ class ArrayArg(ArrayBase, KernelArgument): """ super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) - key_builder.rec(key_hash, self.is_output_only) + key_builder.rec(key_hash, self.is_output) + key_builder.rec(key_hash, self.is_input) # Making this a function prevents incorrect use in isinstance. 
@@ -433,6 +454,17 @@ def GlobalArg(*args, **kwargs): class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + + def __init__(self, *args, **kwargs): + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") + super().__init__(*args, **kwargs) + + # Constant Arg cannot be an output + is_output = False + is_input = True + address_space = AddressSpace.GLOBAL + min_target_axes = 0 max_target_axes = 1 @@ -466,7 +498,7 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument, Taggable): def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output_only=False, tags=None): + is_output=False, is_input=True, tags=None): """ :arg tags: A an instance of or Iterable of instances of :class:`pytools.tag.Tag` intended for consumption by an @@ -477,7 +509,9 @@ class ValueArg(KernelArgument, Taggable): dtype=dtype, approximately=approximately, target=target, - is_output_only=is_output_only, tags=tags) + is_output=is_output, + is_input=is_input, + tags=tags) def __str__(self): import loopy as lp diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..6779a1bc75e50a56d2efa2f4a5dbaeb2e9b59e21 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,936 @@ +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase +from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import DependencyMapper, WalkMapper + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable + +""" + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + def map_expr(self, subst_mapper): + return self.copy() + + def depends_on(self): + return frozenset() + + update_persistent_hash = update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable. 
To be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used for + matching shape and address space of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. attribute:: dim_tags + + A tuple of instances of + :class:`loopy.kernel.array.ArrayDimImplementationTag` + + .. automethod:: map_expr + .. automethod:: depends_on + """ + + fields = {"shape", "address_space", "dim_tags"} + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import ArrayDimImplementationTag + from loopy.kernel.data import auto + + assert isinstance(shape, tuple) or shape in [None, auto] + assert isinstance(dim_tags, tuple) or dim_tags is None + + if dim_tags: + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in + dim_tags) + + # }}} + + super().__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + def map_expr(self, f): + """ + Returns an instance of :class:`ArrayArgDescriptor` with its shapes, strides, + mapped by *f*. + """ + if self.shape is not None: + new_shape = tuple(f(axis_len) for axis_len in self.shape) + else: + new_shape = None + + if self.dim_tags is not None: + new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) + else: + new_dim_tags = None + + return self.copy(shape=new_shape, dim_tags=new_dim_tags) + + def depends_on(self): + """ + Returns class:`frozenset` of all the variable names the + :class:`ArrayArgDescriptor` depends on. + """ + from loopy.kernel.data import auto + result = set() + + if self.shape: + dep_mapper = DependencyMapper(composite_leaves=False) + for axis_len in self.shape: + if axis_len not in [None, auto]: + result |= dep_mapper(axis_len) + + if self.dim_tags: + for dim_tag in self.dim_tags: + result |= dim_tag.depends_on() + + return frozenset(var.name for var in result) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.shape) + key_builder.rec(key_hash, self.address_space) + key_builder.rec(key_hash, self.dim_tags) + + +class ExpressionIsScalarChecker(WalkMapper): + def __init__(self, kernel): + self.kernel = kernel + + def map_sub_array_ref(self, expr): + raise LoopyError("Sub-array refs can only be used as call's parameters" + f" or assignees. 
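For concreteness, a descriptor for a ``16 x 16`` row-major array residing in
global memory could be built as follows (a sketch; the shape and stride values
are illustrative)::

    from loopy.kernel.array import FixedStrideArrayDimTag
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.function_interface import ArrayArgDescriptor

    descr = ArrayArgDescriptor(
            shape=(16, 16),
            address_space=AddressSpace.GLOBAL,
            dim_tags=(FixedStrideArrayDimTag(16), FixedStrideArrayDimTag(1)))

    # no kernel variables occur in the shape or the strides
    assert descr.depends_on() == frozenset()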
+
+class ExpressionIsScalarChecker(WalkMapper):
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def map_sub_array_ref(self, expr):
+        raise LoopyError("Sub-array refs can only be used as a call's parameters"
+                f" or assignees. '{expr}' violates this.")
+
+    def map_call(self, expr):
+        for child in expr.parameters:
+            self.rec(child)
+
+    def map_call_with_kwargs(self, expr):
+        for child in expr.parameters + tuple(expr.kw_parameters.values()):
+            self.rec(child)
+
+    def map_subscript(self, expr):
+        for child in expr.index_tuple:
+            self.rec(child)
+
+    def map_variable(self, expr):
+        from loopy.kernel.data import TemporaryVariable, ArrayArg, auto
+        if expr.name in self.kernel.all_inames():
+            # inames are scalar
+            return
+
+        var = self.kernel.arg_dict.get(expr.name, None) or (
+                self.kernel.temporary_variables.get(expr.name, None))
+
+        if var is not None:
+            if isinstance(var, (ArrayArg, TemporaryVariable)) and (
+                    var.shape != () and var.shape is not auto):
+                raise LoopyError("Array regions can only be passed as "
+                        "sub-array refs.")
+
+    def map_slice(self, expr):
+        raise LoopyError("Array regions can only be passed as sub-array refs.")
+
+
+def get_arg_descriptor_for_expression(kernel, expr):
+    """
+    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
+        describing the argument expression *expr* which occurs
+        in a call in the code of *kernel*.
+    """
+    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
+            SweptInameStrideCollector)
+    from loopy.kernel.data import TemporaryVariable, ArrayArg
+
+    if isinstance(expr, SubArrayRef):
+        name = expr.subscript.aggregate.name
+        arg = kernel.get_var_descriptor(name)
+
+        if not isinstance(arg, (TemporaryVariable, ArrayArg)):
+            raise LoopyError("unsupported argument type "
+                    "'%s' of '%s' in call statement"
+                    % (type(arg).__name__, expr.name))
+
+        aspace = arg.address_space
+
+        from loopy.kernel.array import FixedStrideArrayDimTag as DimTag
+        sub_dim_tags = []
+        sub_shape = []
+
+        # FIXME This blindly assumes that dim_tag has a stride and
+        # will not work for non-stride dim tags (e.g. vec or sep).
+
+        # (AK) FIXME: This will almost always be nonlinear--when does this
+        # actually help? Maybe remove this?
+        # (KK) Reply: This helps in identifying identities like
+        # "2*(i//2) + i%2" := "i"
+        # See the kernel in
+        # test_callables.py::test_shape_translation_through_sub_array_refs
+
+        from loopy.symbolic import simplify_using_aff
+        linearized_index = simplify_using_aff(
+                kernel,
+                sum(dim_tag.stride*iname for dim_tag, iname in
+                    zip(arg.dim_tags, expr.subscript.index_tuple)))
+
+        strides_as_dict = SweptInameStrideCollector(
+                tuple(iname.name for iname in expr.swept_inames)
+                )(linearized_index)
+        sub_dim_tags = tuple(
+                # Not all swept inames necessarily occur in the expression.
+                DimTag(strides_as_dict.get(iname, 0))
+                for iname in expr.swept_inames)
+        sub_shape = tuple(
+                pw_aff_to_expr(
+                    kernel.get_iname_bounds(iname.name).upper_bound_pw_aff
+                    - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1
+                for iname in expr.swept_inames)
+
+        return ArrayArgDescriptor(
+                address_space=aspace,
+                dim_tags=sub_dim_tags,
+                shape=sub_shape)
+    else:
+        ExpressionIsScalarChecker(kernel)(expr)
+        return ValueArgDescriptor()
+
+# }}}
+
+
+# {{{ helper function for in-kernel callables
+
+def get_kw_pos_association(kernel):
+    """
+    Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
+    *kernel*.
+    """
+    kw_to_pos = {}
+    pos_to_kw = {}
+
+    read_count = 0
+    write_count = -1
+
+    for arg in kernel.args:
+        if arg.is_output:
+            kw_to_pos[arg.name] = write_count
+            pos_to_kw[write_count] = arg.name
+            write_count -= 1
+        if arg.is_input:
+            # if an argument is both input and output, then kw_to_pos is
+            # overwritten with its expected position in the parameters
+            kw_to_pos[arg.name] = read_count
+            pos_to_kw[read_count] = arg.name
+            read_count += 1
+
+    return kw_to_pos, pos_to_kw
+
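For example, for a callee kernel with arguments ``a`` (input), ``b`` (input)
and ``out`` (output only), in that order, the two maps come out as follows (a
sketch; ``callee_knl`` and the argument names are illustrative)::

    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)

    assert kw_to_pos == {"a": 0, "b": 1, "out": -1}
    assert pos_to_kw == {0: "a", 1: "b", -1: "out"}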
+ """ + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if arg.is_output: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + if arg.is_input: + # if an argument is both input and output then kw_to_pos is + # overwritten with its expected position in the parameters + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer to + :meth:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :meth:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. note:: + + This class acts as a pseudo-callable and its significance lies in + solving picklability issues. + """ + fields = {"local_size", "global_size"} + + def __init__(self, global_size, local_size): + self.global_size = global_size + self.local_size = local_size + + def __call__(self, insn_ids, callables_table, ignore_auto=True): + return self.global_size, self.local_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within expressions in + a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types of the + callable. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. + + .. note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. + + - Negative "arg_id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen + """ + + fields = {"arg_id_to_dtype", "arg_id_to_descr"} + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + update_persistent_hash = update_persistent_hash + + def with_types(self, arg_id_to_dtype, callables_table): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. 
+
+# {{{ template class
+
+class InKernelCallable(ImmutableRecord):
+    """
+    An abstract interface to define a callable encountered in a kernel.
+
+    .. attribute:: name
+
+        The name of the callable which can be encountered within expressions in
+        a kernel.
+
+    .. attribute:: arg_id_to_dtype
+
+        A mapping which indicates the argument types and result types of the
+        callable.
+
+    .. attribute:: arg_id_to_descr
+
+        A mapping which indicates the argument shapes and ``dim_tags`` the
+        callable is responsible for generating code for.
+
+    .. note::
+
+        - ``arg_id`` can either be an instance of :class:`int`, corresponding
+          to the position of the argument, or an instance of :class:`str`,
+          corresponding to the name of the keyword argument accepted by the
+          function.
+
+        - Negative ``arg_id`` values ``-i`` in the mapping attributes indicate
+          the return value with (0-based) index *i*.
+
+    .. automethod:: __init__
+    .. automethod:: with_types
+    .. automethod:: with_descrs
+    .. automethod:: with_target
+    .. automethod:: with_hw_axes_sizes
+    .. automethod:: generate_preambles
+    .. automethod:: emit_call
+    .. automethod:: emit_call_insn
+    .. automethod:: is_ready_for_codegen
+    """
+
+    fields = {"arg_id_to_dtype", "arg_id_to_descr"}
+    init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr")
+
+    def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None):
+
+        super().__init__(
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr)
+
+    def __getinitargs__(self):
+        return (self.arg_id_to_dtype, self.arg_id_to_descr)
+
+    update_persistent_hash = update_persistent_hash
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+        """
+        :arg arg_id_to_dtype: a mapping from argument identifiers
+            (integers for positional arguments, names for keyword
+            arguments) to :class:`loopy.types.LoopyType` instances.
+            Unspecified/unknown types are not represented in *arg_id_to_dtype*.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a tuple ``(new_self, arg_id_to_dtype)``, where *new_self* is a
+            new :class:`InKernelCallable` specialized for the given types,
+            and *arg_id_to_dtype* is a mapping of the same form as the
+            argument above, however it may have more information present.
+            Any argument information exists both by its positional and
+            its keyword identifier.
+        """
+
+        raise NotImplementedError()
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        """
+        :arg arg_id_to_descr: a mapping from argument identifiers (integers for
+            positional arguments, names for keyword arguments) to
+            :class:`loopy.ArrayArgDescriptor` instances. Unspecified/unknown
+            descriptors are not represented in *arg_id_to_descr*.
+
+            All the expressions in *arg_id_to_descr* must have variables that
+            belong to the callable's namespace.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a copy of *self* which is a new instance of
+            :class:`InKernelCallable` specialized for the given descriptors, and
+            *arg_id_to_descr* is a mapping of the same form as the argument above,
+            however it may have more information present. Any argument information
+            exists both by its positional and its keyword identifier.
+        """
+
+        raise NotImplementedError()
+
+    def with_target(self, target):
+        """
+        Returns a copy of *self* with all the ``dtypes`` in
+        ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer
+        to :meth:`loopy.types.LoopyType.with_target`.
+
+        :arg target: An instance of :class:`loopy.target.TargetBase`.
+        """
+
+        if target is None:
+            raise LoopyError("target cannot be None for with_target")
+
+        def with_target_if_not_None(dtype):
+            """
+            Returns a copy of *dtype* associated with the target. If *dtype*
+            is *None*, returns *None*.
+            """
+            if dtype:
+                return dtype.with_target(target)
+            else:
+                return None
+
+        new_arg_id_to_dtype = None
+        if self.arg_id_to_dtype is not None:
+            new_arg_id_to_dtype = {id: with_target_if_not_None(dtype)
+                    for id, dtype in self.arg_id_to_dtype.items()}
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
+
+    def with_hw_axes_sizes(self, global_size, local_size):
+        """
+        Returns a copy of *self* with modifications to comply with the grid
+        sizes ``(local_size, global_size)`` of the program in which it is
+        supposed to be called.
+
+        :arg local_size: An instance of :class:`islpy.PwAff`.
+        :arg global_size: An instance of :class:`islpy.PwAff`.
+        """
+        raise NotImplementedError()
+
+    def is_ready_for_codegen(self):
+
+        return (self.arg_id_to_dtype is not None and
+                self.arg_id_to_descr is not None)
+
+    def generate_preambles(self, target):
+        """
+        Yields the target-specific preamble.
+        """
+        raise NotImplementedError()
+
+    def emit_call(self, expression_to_code_mapper, expression, target):
+
+        raise NotImplementedError()
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        """
+        Returns a tuple ``(call, assignee_is_returned)``, the target-facing
+        function call that would be seen in the generated code. ``call`` is an
+        instance of ``pymbolic.primitives.Call``, and ``assignee_is_returned``
+        is an instance of :class:`bool` indicating whether the assignee is
+        returned by value for C-type targets.
+
+        *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is
+        interpreted in the target as ``a = f(c, d, &b)``. If
+        ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted
+        in the target as the statement ``f(c, d, &a, &b)``.
+        """
+
+        raise NotImplementedError()
+
+    def __hash__(self):
+
+        return hash(tuple(self.fields))
+
+    def with_added_arg(self, arg_dtype, arg_descr):
+        """
+        Registers a new argument to the callable and returns the name of the
+        argument in the callable's namespace.
+        """
+        raise NotImplementedError()
+
+# }}}
+
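To illustrate the ``arg_id`` convention, for a callable invoked as
``c, d = f(a, b)``, a fully specialized ``arg_id_to_dtype`` would take roughly
the following shape (an illustrative sketch; the keyword names assume the
callee declares arguments ``a`` and ``b``)::

    import numpy as np
    from loopy.types import NumpyType

    f32 = NumpyType(np.dtype(np.float32))

    arg_id_to_dtype = {
            0: f32, "a": f32,     # first positional argument / its keyword
            1: f32, "b": f32,     # second positional argument / its keyword
            -1: f32,              # first return value (c)
            -2: f32,              # second return value (d)
            }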
+ """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + + def with_added_arg(self, arg_dtype, arg_descr): + """ + Registers a new argument to the callable and returns the name of the + argument in the callable's namespace. + """ + raise NotImplementedError() + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstract interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the function and sub-classes must define it. + """ + + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, callables_table): + raise LoopyError("No type inference information present for " + "the function %s." % (self.name)) + + def with_descrs(self, arg_id_to_descr, callables_table): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + + :returns: A tuple of the call to be generated and an instance of + :class:`bool` whether the first assignee is a part of the LHS in + the assignment instruction. + + .. note:: + + The default implementation returns the first assignees and the + references of the rest of the assignees are appended to the + arguments of the call. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + """ + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. 
+
+# {{{ callable kernel
+
+class CallableKernel(InKernelCallable):
+    """
+    Records information about a callee kernel. Also provides an interface,
+    through member methods, to make the callee kernel compatible with calls
+    from a caller kernel. :meth:`loopy.register_callable_kernel` should be
+    called in order to initiate the association between a function in the
+    caller kernel and the callee kernel.
+
+    :meth:`CallableKernel.with_types` should be called in order to match
+    the ``dtypes`` of the arguments that are shared between the caller and the
+    callee kernel.
+
+    :meth:`CallableKernel.with_descrs` should be called in order to match
+    :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape` and
+    :attr:`ArrayArgDescriptor.address_space` of the arguments shared between
+    the caller and the callee kernel.
+
+    :meth:`CallableKernel.with_hw_axes_sizes` should be called to set the grid
+    sizes for the :attr:`subkernel` of the callable.
+ """ + + fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None): + assert isinstance(subkernel, LoopKernel) + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.subkernel = subkernel + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, callables_table): + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel, callables_table = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + callables_table)) + + new_arg_id_to_dtype = {} + for pos, kw in pos_to_kw.items(): + arg = specialized_kernel.arg_dict[kw] + if arg.dtype: + new_arg_id_to_dtype[kw] = arg.dtype + new_arg_id_to_dtype[pos] = arg.dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype), callables_table + + def with_descrs(self, arg_id_to_descr, callables_table): + + # arg_id_to_descr expressions provided are from the caller's namespace, + # need to register + + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + kw_to_callee_idx = {arg.name: i + for i, arg in enumerate(self.subkernel.args)} + + new_args = self.subkernel.args[:] + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + + callee_arg = new_args[kw_to_callee_idx[arg_id]] + + # {{{ checks + + if isinstance(callee_arg, ValueArg) and ( + isinstance(descr, ArrayArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be a scalar, got an array region.") + + if isinstance(callee_arg, ArrayArg) and ( + isinstance(descr, ValueArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be an array, got a scalar.") + + if (isinstance(descr, ArrayArgDescriptor) + and isinstance(callee_arg.shape, tuple) + and len(callee_arg.shape) != len(descr.shape)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}'" + " has a dimensionality mismatch, expected " + f"{len(callee_arg.shape)}, got {len(descr.shape)}") + + # }}} + + if isinstance(descr, ArrayArgDescriptor): + callee_arg = callee_arg.copy(shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + else: + # do nothing for a scalar arg. 
+ assert isinstance(descr, ValueArgDescriptor) + + new_args[kw_to_callee_idx[arg_id]] = callee_arg + + subkernel = self.subkernel.copy(args=new_args) + + from loopy.preprocess import traverse_to_infer_arg_descr + subkernel, callables_table = ( + traverse_to_infer_arg_descr(subkernel, + callables_table)) + + # {{{ update the arg descriptors + + for arg in subkernel.args: + kw = arg.name + if isinstance(arg, ArrayBase): + arg_id_to_descr[kw] = ( + ArrayArgDescriptor(shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=arg.address_space)) + else: + assert isinstance(arg, ValueArg) + arg_id_to_descr[kw] = ValueArgDescriptor() + + arg_id_to_descr[kw_to_pos[kw]] = arg_id_to_descr[kw] + + # }}} + + return (self.copy(subkernel=subkernel, + arg_id_to_descr=arg_id_to_descr), + callables_table) + + def with_added_arg(self, arg_dtype, arg_descr): + var_name = self.subkernel.get_var_name_generator()(based_on="_lpy_arg") + + if isinstance(arg_descr, ValueArgDescriptor): + subknl = self.subkernel.copy( + args=self.subkernel.args+[ + ValueArg(var_name, arg_dtype, self.subkernel.target)]) + + kw_to_pos, pos_to_kw = get_kw_pos_association(subknl) + + if self.arg_id_to_dtype is None: + arg_id_to_dtype = {} + else: + arg_id_to_dtype = self.arg_id_to_dtype.copy() + if self.arg_id_to_descr is None: + arg_id_to_descr = {} + else: + arg_id_to_descr = self.arg_id_to_descr.copy() + + arg_id_to_dtype[var_name] = arg_dtype + arg_id_to_descr[var_name] = arg_descr + arg_id_to_dtype[kw_to_pos[var_name]] = arg_dtype + arg_id_to_descr[kw_to_pos[var_name]] = arg_descr + + return (self.copy(subkernel=subknl, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr), + var_name) + + else: + # don't think this should ever be needed + raise NotImplementedError("with_added_arg not implemented for array" + " types arguments.") + + def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(gsize, lsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME Check that this is correct. 
+
+        return
+        yield
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+
+        assert self.is_ready_for_codegen()
+
+        from loopy.kernel.instruction import CallInstruction
+        from pymbolic.primitives import CallWithKwargs
+
+        assert isinstance(insn, CallInstruction)
+
+        parameters = insn.expression.parameters
+        kw_parameters = {}
+        if isinstance(insn.expression, CallWithKwargs):
+            kw_parameters = insn.expression.kw_parameters
+
+        assignees = insn.assignees
+
+        parameters = list(parameters)
+        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
+            parameters.append(kw_parameters[pos_to_kw[i]])
+            par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]])
+
+        # insert the assignees at the required positions
+        assignee_write_count = -1
+        for i, arg in enumerate(self.subkernel.args):
+            if arg.is_output:
+                if not arg.is_input:
+                    assignee = assignees[-assignee_write_count-1]
+                    parameters.insert(i, assignee)
+                    par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count])
+
+                assignee_write_count -= 1
+
+        # no type casting in array calls: sub-array refs and scalar parameters
+        # are mapped uniformly here
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from pymbolic import var
+
+        c_parameters = [
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, par_dtype),
+                    par_dtype).expr
+                for par, par_dtype in zip(
+                    parameters, par_dtypes)]
+
+        return var(self.subkernel.name)(*c_parameters), False
+
+# }}}
+
+
+# vim: foldmethod=marker
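With these classes in place, the intended end-to-end usage looks roughly as
follows (a sketch: the names and shapes are illustrative, and the registration
call follows the interface described earlier in this patch)::

    import numpy as np
    import loopy as lp

    callee = lp.make_function(
            "{[i]: 0 <= i < 16}",
            "b[i] = 2*a[i]",
            name="twice", lang_version=(2018, 2))

    caller = lp.make_kernel(
            "{[j]: 0 <= j < 16}",
            "y[:] = twice(x[:])",
            [lp.GlobalArg("x,y", np.float64, shape=(16,))],
            lang_version=(2018, 2))

    caller = lp.register_callable_kernel(caller, "twice", callee)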
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0e56d41612cb6903100718f5dc6c20076294a1a0..6b428ae9330695a8a6883377a9d30766d8c3b3d1 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -468,7 +468,7 @@ class InstructionBase(ImmutableRecord, Taggable):
 
 def _get_assignee_var_name(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript
+    from loopy.symbolic import LinearSubscript, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -487,13 +487,20 @@ def _get_assignee_var_name(expr):
         assert isinstance(agg, Variable)
 
         return agg.name
+
+    elif isinstance(expr, SubArrayRef):
+        agg = expr.subscript.aggregate
+        assert isinstance(agg, Variable)
+
+        return agg.name
+
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
 def _get_assignee_subscript_deps(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript, get_dependencies
+    from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -504,6 +511,9 @@ def _get_assignee_subscript_deps(expr):
         return get_dependencies(expr.index)
     elif isinstance(expr, LinearSubscript):
         return get_dependencies(expr.index)
+    elif isinstance(expr, SubArrayRef):
+        return get_dependencies(expr.subscript.index) - (
+                frozenset(iname.name for iname in expr.swept_inames))
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
@@ -1034,9 +1044,10 @@ class CallInstruction(MultiAssignmentBase):
                 predicates=predicates,
                 tags=tags)
 
-        from pymbolic.primitives import Call
+        from pymbolic.primitives import Call, CallWithKwargs
         from loopy.symbolic import Reduction
-        if not isinstance(expression, (Call, Reduction)) and expression is not None:
+        if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and (
+                expression is not None):
             raise LoopyError("'expression' argument to CallInstruction "
                     "must be a function call")
 
@@ -1052,9 +1063,10 @@ class CallInstruction(MultiAssignmentBase):
             expression = parse(expression)
 
         from pymbolic.primitives import Variable, Subscript
-        from loopy.symbolic import LinearSubscript
+        from loopy.symbolic import LinearSubscript, SubArrayRef
         for assignee in assignees:
-            if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
+            if not isinstance(assignee, (Variable, Subscript, LinearSubscript,
+                    SubArrayRef)):
                 raise LoopyError("invalid lvalue '%s'" % assignee)
 
         self.assignees = assignees
@@ -1123,6 +1135,22 @@ class CallInstruction(MultiAssignmentBase):
             result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
         return result
 
+    def arg_id_to_val(self):
+        """
+        :returns: a :class:`dict` mapping argument identifiers (non-negative
+            numbers for positional arguments, strings for keyword arguments,
+            and negative numbers for assignees) to their respective values
+        """
+
+        from pymbolic.primitives import CallWithKwargs
+        arg_id_to_val = dict(enumerate(self.expression.parameters))
+        if isinstance(self.expression, CallWithKwargs):
+            for kw, val in self.expression.kw_parameters.items():
+                arg_id_to_val[kw] = val
+        for i, arg in enumerate(self.assignees):
+            arg_id_to_val[-i-1] = arg
+
+        return arg_id_to_val
+
     @property
     def atomicity(self):
         # Function calls can impossibly be atomic, and even the result assignment
@@ -1133,34 +1161,118 @@ class CallInstruction(MultiAssignmentBase):
 
 # }}}
 
 
+def subscript_contains_slice(subscript):
+    """Return *True* if the *subscript* contains an instance of
+    :class:`pymbolic.primitives.Slice` as one of its indices.
+    """
+    from pymbolic.primitives import Subscript, Slice
+    assert isinstance(subscript, Subscript)
+    return any(isinstance(index, Slice) for index in subscript.index_tuple)
+
+
+def is_array_call(assignees, expression):
+    """
+    Returns *True* if the instruction is an array call.
+
+    An array call is a function call applied to array type objects. If any of
+    the arguments or assignees to the function is an array,
+    :func:`is_array_call` will return *True*.
+    """
+    from pymbolic.primitives import Call, CallWithKwargs, Subscript
+    from loopy.symbolic import SubArrayRef
+
+    if not isinstance(expression, (Call, CallWithKwargs)):
+        return False
+
+    for par in expression.parameters+assignees:
+        if isinstance(par, SubArrayRef):
+            return True
+        elif isinstance(par, Subscript):
+            if subscript_contains_slice(par):
+                return True
+
+    # did not encounter a SubArrayRef/Slice, hence must be a normal call
+    return False
+
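Given the helpers above, the distinction can be exercised directly on
:mod:`pymbolic` expressions (a minimal sketch)::

    from pymbolic import var
    from pymbolic.primitives import Call, Subscript, Slice
    from loopy.kernel.instruction import is_array_call

    a, x, f = var("a"), var("x"), var("f")

    # a slice-subscripted argument makes this an array call
    assert is_array_call((Subscript(a, Slice(())),),
                         Call(f, (Subscript(x, Slice(())),)))

    # plain scalar arguments do not
    assert not is_array_call((a,), Call(f, (x,)))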
+ """ + from pymbolic.primitives import Subscript, Variable + from loopy.symbolic import SubArrayRef + if isinstance(assignee, SubArrayRef): + return assignee + elif isinstance(assignee, Subscript): + if subscript_contains_slice(assignee): + # Slice subscripted array are treated as SubArrayRef in the kernel + # Hence, making the behavior similar to that of `SubArrayref` + return assignee + else: + return SubArrayRef((), assignee) + elif isinstance(assignee, Variable): + return SubArrayRef((), Subscript(assignee, 0)) + else: + raise LoopyError("ArrayCall only takes Variable, Subscript or " + "SubArrayRef as its inputs") + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): + if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - if len(assignees) == 1: + if len(assignees) != 1 or is_array_call(assignees, expression): + atomicity = kwargs.pop("atomicity", ()) + if atomicity: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, CallWithKwargs, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction, got: '%s'" % expression) + + if not is_array_call(assignees, expression): + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + # In the case of an array call, it is important to have each + # assignee as an instance of SubArrayRef. If not given as a + # SubArrayRef + return CallInstruction( + assignees=tuple(modify_assignee_for_array_call( + assignee) for assignee in assignees), + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + def _is_array(expr): + from loopy.symbolic import SubArrayRef + from pymbolic.primitives import (Subscript, Slice) + if isinstance(expr, SubArrayRef): + return True + if isinstance(expr, Subscript): + return any(isinstance(idx, Slice) for idx in + expr.index_tuple) + return False + + from loopy.symbolic import DependencyMapper + if any(_is_array(dep) for dep in DependencyMapper()((assignees, + expression))): + raise LoopyError("Array calls only supported as instructions" + " with function call as RHS for now.") + return Assignment( assignee=assignees[0], expression=expression, temp_var_type=temp_var_types[0], **kwargs) - atomicity = kwargs.pop("atomicity", ()) - if atomicity: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) - # {{{ c instruction diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index e16cb59f3d3d2e3f6b116f8d13a0ea93e5268f65..5cae76192fb2d8201f3a2881a2fdb4c1d4f57438 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -32,26 +32,45 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted - +from loopy.kernel import LoopKernel +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import 
 
 # {{{ c instruction
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index e16cb59f3d3d2e3f6b116f8d13a0ea93e5268f65..5cae76192fb2d8201f3a2881a2fdb4c1d4f57438 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -32,26 +32,45 @@ import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg, natsorted
-
+from loopy.kernel import LoopKernel
+from loopy.program import Program, iterate_over_kernels_if_given_program
+from loopy.kernel.function_interface import CallableKernel
 
 import logging
 logger = logging.getLogger(__name__)
 
 
 # {{{ add and infer argument dtypes
 
-def add_dtypes(kernel, dtype_dict):
+def add_dtypes(prog_or_kernel, dtype_dict):
     """Specify remaining unspecified argument/temporary variable types.
 
     :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype`
        instances
     """
-    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict)
+    if isinstance(prog_or_kernel, Program):
+        kernel_names = [clbl.subkernel.name for clbl in
+                prog_or_kernel.callables_table.values() if isinstance(clbl,
+                    CallableKernel)]
+        if len(kernel_names) != 1:
+            raise LoopyError("add_dtypes may not take a Program with more than"
+                    " one callable kernel. Please provide individual kernels"
+                    " instead.")
+
+        kernel_name, = kernel_names
+
+        return prog_or_kernel.with_kernel(
+                add_dtypes(prog_or_kernel[kernel_name], dtype_dict))
+
+    assert isinstance(prog_or_kernel, LoopKernel)
+
+    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(
+            prog_or_kernel, dtype_dict)
 
     if dtype_dict_remainder:
         raise RuntimeError("unused argument dtypes: %s"
                 % ", ".join(dtype_dict_remainder))
 
-    return kernel.copy(args=new_args, temporary_variables=new_temp_vars)
+    return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars)
 
 
 def _add_dtypes_overdetermined(kernel, dtype_dict):
@@ -103,7 +122,18 @@ def get_arguments_with_incomplete_dtype(kernel):
             if arg.dtype is None]
 
 
-def add_and_infer_dtypes(kernel, dtype_dict, expect_completion=False):
+def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False,
+        kernel_name=None):
+    assert isinstance(prog, Program)
+    if kernel_name is None:
+        kernel_names = [clbl.subkernel.name for clbl in
+                prog.callables_table.values() if isinstance(clbl,
+                    CallableKernel)]
+        if len(kernel_names) != 1:
+            raise LoopyError("Provide a 'kernel_name' argument.")
+
+        kernel_name, = kernel_names
+
     processed_dtype_dict = {}
 
     for k, v in dtype_dict.items():
@@ -112,10 +142,10 @@ def add_and_infer_dtypes(kernel, dtype_dict, expect_completion=False):
             if subkey:
                 processed_dtype_dict[subkey] = v
 
-    kernel = add_dtypes(kernel, processed_dtype_dict)
+    prog = prog.with_kernel(add_dtypes(prog[kernel_name], processed_dtype_dict))
 
     from loopy.type_inference import infer_unknown_types
-    return infer_unknown_types(kernel, expect_completion=expect_completion)
+    return infer_unknown_types(prog, expect_completion=expect_completion)
 
 
 def _add_and_infer_dtypes_overdetermined(kernel, dtype_dict):
@@ -463,8 +493,10 @@ class DomainChanger:
 
 # {{{ graphviz / dot export
 
-def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
-    """Return a string in the `dot <https://graphviz.org>`__ language depicting
+@iterate_over_kernels_if_given_program
+def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True,
+        use_insn_id=False):
+    """Return a string in the `dot <https://graphviz.org>`_ language depicting
     dependencies among kernel instructions.
""" @@ -475,7 +507,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn @@ -756,7 +788,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -770,7 +802,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -793,6 +825,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if not isinstance(tag, AutoLocalIndexTagBase))) return assign_automatic_axes( kernel.copy(inames=new_inames), + callables_table, axis=recursion_axis) if axis is None: @@ -832,7 +865,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. @@ -843,6 +877,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -859,7 +894,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): new_inames = kernel.inames.copy() new_inames[iname] = kernel.inames[iname].copy(tags=new_tags) return assign_automatic_axes(kernel.copy(inames=new_inames), - axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -927,7 +962,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1855,35 +1891,105 @@ def find_aliasing_equivalence_classes(kernel): # }}} +# {{{ callee kernel tools + +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): + """ + Returns an instance of :class:`frozenset` of all the callee kernels + called in instructions in the *kernel* whose IDs are given in *insn_ids*. + + :arg kernel: An instance of :class:`LoopKernel`. + :arg insn_ids: An instance of :class:`frozenset`. + + If *insn_ids* is *None* returns all the callee kernels called by *kernel*. + """ + #FIXME: explain what "direct" means + + if insn_ids is None: + insn_ids = frozenset(insn.id for insn in kernel.instructions) + + def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): + """Returns callee kernel if the instruction has a call to a + :class:`loopy.kernel.function_interface.CallableKernel`. Otherwise + returns *None*. 
+ """ + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import (CallInstruction, + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) + from pymbolic.primitives import Call + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in callables_table): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + return in_knl_callable.subkernel + elif isinstance(insn, (MultiAssignmentBase, + CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." % + type(insn)) + + return None + + return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(insn_id) + for insn_id in insn_ids]) - frozenset([None]) + +# }}} + + # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_input_output(kernel): """ - Returns a copy of *kernel* with the attribute ``is_output_only`` set. + Returns a copy of *kernel* with the attributes ``is_input`` and + ``is_output`` of the arguments set. .. note:: - If the attribute ``is_output_only`` is not supplied from an user, then - infers it as an output argument if it is written at some point in the - kernel. + If the :attr:`~loopy.ArrayArg.is_output` is not supplied from a user, + then the array is inferred as an output argument if it is written at + some point in the kernel. + + If the :attr:`~loopy.ArrayArg.is_input` is not supplied from a user, + then the array is inferred as an input argument if it is either read at + some point in the kernel or it is neither read nor written. """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, ArrayArg): - if arg.is_output_only is not None: - assert isinstance(arg.is_output_only, bool) - new_args.append(arg) + if arg.is_output is not None: + assert isinstance(arg.is_output, bool) else: if arg.name in kernel.get_written_variables(): - new_args.append(arg.copy(is_output_only=True)) + arg = arg.copy(is_output=True) + else: + arg = arg.copy(is_output=False) + + if arg.is_input is not None: + assert isinstance(arg.is_input, bool) + else: + if arg.name in kernel.get_read_variables() or ( + (arg.name not in kernel.get_read_variables()) and ( + arg.name not in kernel.get_written_variables())): + arg = arg.copy(is_input=True) else: - new_args.append(arg.copy(is_output_only=False)) + arg = arg.copy(is_input=False) elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): - new_args.append(arg) + pass else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + if not (arg.is_input or arg.is_output): + raise LoopyError("Kernel argument must be either input or output." + " '{}' in '{}' does not follow it.".format(arg.name, + kernel.name)) + + new_args.append(arg) + return kernel.copy(args=new_args) # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index 99af08169c0ea053a1671e0ab087f24a86c16e3b..d7558960ab0c7e2c4f045655a068fc67d0785797 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -20,38 +20,109 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None +from loopy.kernel.function_interface import ScalarCallable +from loopy.diagnostic import LoopyError +from loopy.types import NumpyType +import numpy as np + + +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), callables_table) + + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} + + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + callables_table) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = {i: dtype + for i, dtype in arg_id_to_dtype.items() + if dtype is not None} + new_arg_id_to_dtype[-1] = NumpyType(np.int32) + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + callables_table) + + def emit_call(self, expression_to_code_mapper, expression, target): + from pymbolic.primitives import Subscript + + if len(expression.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % self.name) + arg, = expression.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % self.name) + + ary = expression_to_code_mapper.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(expression_to_code_mapper.kernel.target, + ary, arg.index, lambda expr: evaluate(expr, + expression_to_code_mapper.codegen_state.var_subst_map), + expression_to_code_mapper.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % self.name) + + if self.name == "indexof": + return access_info.subscripts[0] + elif self.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + return self.emit_call( + expression_to_code_mapper, + insn.expression, + target), True 
+
+
+def get_loopy_callables():
+    """
+    Returns a mapping from function ids to corresponding
+    :class:`loopy.kernel.function_interface.InKernelCallable` for functions
+    whose interface is provided by :mod:`loopy`. Callables that fall in this
+    category are --
+
+    - reductions leading to function calls like ``argmin``, ``argmax``.
+    - callables that have a predefined meaning in :mod:`loo.py` like
+      ``make_tuple``, ``indexof``, ``indexof_vec``.
+    """
+    known_callables = {
+            "make_tuple": MakeTupleCallable(name="make_tuple"),
+            "indexof": IndexOfCallable(name="indexof"),
+            "indexof_vec": IndexOfCallable(name="indexof_vec"),
+            }
+
+    return known_callables
 
 
 # vim: foldmethod=marker
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 7f24dd3a0e3699fb0bb55ac1d4022645dedac854..2d4f82205904aa7dcaf27c803a56f2f3442c59be 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 from pytools import ImmutableRecord
 from mako.template import Template
+from loopy.kernel.function_interface import ScalarCallable
 
 import numpy as np
 
@@ -162,60 +163,86 @@ double${ width } ${ name }_f64(
 # }}}
 
 
-def random123_preamble_generator(preamble_info):
-    for f in preamble_info.seen_functions:
-        try:
-            rng_variant = FUNC_NAMES_TO_RNG[f.name]
-        except KeyError:
-            continue
+class Random123Callable(ScalarCallable):
+    """
+    Records information about the random123 functions.
+    """
+    fields = ScalarCallable.fields | {"target"}
+
+    def __init__(self, name, arg_id_to_dtype=None,
+            arg_id_to_descr=None, name_in_target=None, target=None):
+
+        super().__init__(
+                name=name,
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr,
+                name_in_target=name_in_target)
+
+        self.target = target
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+
+        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (self.copy(),
+                    callables_table)
+
+        name = self.name
+        target = self.target
+
+        rng_variant = FUNC_NAMES_TO_RNG[name]
+
+        from loopy.types import NumpyType
+        base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
+        ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
+        key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
+
+        fn = rng_variant.full_name
+        if name == fn:
+            new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return (
+                    self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                        name_in_target=fn+"_gen"),
+                    callables_table)
+
+        elif name == fn + "_f32":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), callables_table
+
+        elif name == fn + "_f64":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), callables_table
+
+        return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                callables_table)
+
+    def generate_preambles(self, target):
+        rng_variant = FUNC_NAMES_TO_RNG[self.name]
 
         from loopy.target.pyopencl import PyOpenCLTarget
         yield ("90-random123-"+rng_variant.full_name,
                 PREAMBLE_TEMPLATE.render(
                     is_pyopencl_target=isinstance(
-                        preamble_info.kernel.target,
+                        target,
                        PyOpenCLTarget),
rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def get_random123_callables(target): + return {id_: Random123Callable(id_, target=target) for id_ in FUNC_NAMES_TO_RNG} # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ca763442d3bb7e4f9044b738cb67e70aca703b1..1d53d06b063619726837f467ca12de11599a819c 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -22,11 +22,14 @@ THE SOFTWARE. from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.tools import update_persistent_hash __doc__ = """ .. currentmodule:: loopy.library.reduction @@ -50,7 +53,7 @@ class ReductionOperation: equality-comparable. """ - def result_dtypes(self, target, *arg_dtypes): + def result_dtypes(self, *arg_dtypes): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -62,7 +65,7 @@ class ReductionOperation: def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -109,10 +112,11 @@ class ScalarReductionOperation(ReductionOperation): def arg_count(self): return 1 - def result_dtypes(self, kernel, arg_dtype): + def result_dtypes(self, arg_dtype): if self.forced_result_type is not None: - return (self.parse_result_type( - kernel.target, self.forced_result_type),) + raise NotImplementedError() + # return (self.parse_result_type( + # kernel.target, self.forced_result_type),) if arg_dtype is None: return None @@ -136,29 +140,43 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
-        return 0
+        from loopy import auto
+        if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f":
+            return 0.0, callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return operand1 + operand2
+        return 0, callables_table
+
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        return operand1 + operand2, callables_table
 
 
 class ProductReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
+    def neutral_element(self, dtype, callables_table, target):
         # FIXME: Document that we always use an int here.
-        return 1
+        from loopy import auto
+        if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f":
+            return 1.0, callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return operand1 * operand2
+        return 1, callables_table
+
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        return operand1 * operand2, callables_table
 
 
 def get_le_neutral(dtype):
    """Return a number y that satisfies (x <= y) for all x."""
 
     if dtype.numpy_dtype.kind == "f":
-        # OpenCL 1.1, section 6.11.2
-        return var("INFINITY")
+        # OpenCL 1.2, section 6.12.2
+        if dtype.numpy_dtype.itemsize == 4:
+            # float
+            return var("INFINITY")
+        elif dtype.numpy_dtype.itemsize == 8:
+            # double
+            return var("HUGE_VAL")
+
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
@@ -175,8 +193,13 @@ def get_ge_neutral(dtype):
    """Return a number y that satisfies (x >= y) for all x."""
 
     if dtype.numpy_dtype.kind == "f":
-        # OpenCL 1.1, section 6.11.2
-        return -var("INFINITY")
+        # OpenCL 1.2, section 6.12.2
+        if dtype.numpy_dtype.itemsize == 4:
+            # float
+            return -var("INFINITY")
+        elif dtype.numpy_dtype.itemsize == 8:
+            # double
+            return -var("HUGE_VAL")
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
@@ -190,19 +213,47 @@ class MaxReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
-        return get_ge_neutral(dtype)
+    def neutral_element(self, dtype, callables_table, target):
+        return get_ge_neutral(dtype), callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return var("max")(operand1, operand2)
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        dtype, = dtype
+        from loopy.program import update_table
+
+        # get the callable 'max' from the target
+        max_scalar_callable = target.get_device_ast_builder().known_callables["max"]
+
+        # type-specialize the callable
+        max_scalar_callable, callables_table = max_scalar_callable.with_types(
+                {0: dtype, 1: dtype}, callables_table)
+
+        # populate callables_table
+        func_id, callables_table = update_table(callables_table, "max",
+                max_scalar_callable)
+
+        return ResolvedFunction(func_id)(operand1, operand2), callables_table
 
 
 class MinReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
-        return get_le_neutral(dtype)
+    def neutral_element(self, dtype, callables_table, target):
+        return get_le_neutral(dtype), callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return var("min")(operand1, operand2)
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        dtype, = dtype
+        from loopy.program import update_table
+
+        # get the callable 'min' from the target
+        min_scalar_callable = target.get_device_ast_builder().known_callables["min"]
+
+        # type-specialize the callable
+        min_scalar_callable, callables_table = min_scalar_callable.with_types(
+                {0: dtype, 1: dtype}, callables_table)
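+
+        # The recurring pattern here and in MaxReductionOperation above, in
+        # brief: (1) fetch the target's ScalarCallable for the function,
+        # (2) type-specialize it via with_types, (3) register the
+        # specialization via update_table and call it through
+        # ResolvedFunction. A rough, runnable stand-in for step (3) -- not
+        # loopy's actual update_table implementation:
+        #
+        #     def update_table(table, func_id, clbl):
+        #         # reuse an existing entry if this exact specialization is
+        #         # already known, else store it under an unused id
+        #         for id_, existing in table.items():
+        #             if existing == clbl:
+        #                 return id_, table
+        #         while func_id in table:
+        #             func_id = func_id + "_0"
+        #         table[func_id] = clbl
+        #         return func_id, table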
+
+        # populate callables_table
+        func_id, callables_table = update_table(callables_table, "min",
+                min_scalar_callable)
+
+        return ResolvedFunction(func_id)(operand1, operand2), callables_table
 
 
 # {{{ base class for symbolic reduction ops
@@ -226,6 +277,10 @@ class ReductionOpFunction(FunctionIdentifier):
 
         return type(self)(reduction_op)
 
+    hash_fields = (
+            "reduction_op",)
+
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -257,13 +312,30 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
                 scalar_dtype.numpy_dtype.type.__name__,
                 segment_flag_dtype.numpy_dtype.type.__name__)
 
-    def neutral_element(self, scalar_dtype, segment_flag_dtype):
-        scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
-                segment_flag_dtype.numpy_dtype.type(0))
+    def neutral_element(self, scalar_dtype, segment_flag_dtype,
+            callables_table, target):
+        from loopy.library.function import MakeTupleCallable
+        from loopy.program import update_table
 
-    def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
-        return (self.inner_reduction.result_dtypes(kernel, scalar_dtype)
+        scalar_neutral_element, callables_table = (
+                self.inner_reduction.neutral_element(
+                    scalar_dtype, callables_table, target))
+
+        make_tuple_callable = MakeTupleCallable(
+                name="make_tuple")
+
+        make_tuple_callable, callables_table = make_tuple_callable.with_types(
+                dict(enumerate([scalar_dtype, segment_flag_dtype])),
+                callables_table)
+
+        func_id, callables_table = update_table(
+                callables_table, "make_tuple", make_tuple_callable)
+
+        return ResolvedFunction(func_id)(scalar_neutral_element,
+                segment_flag_dtype.numpy_dtype.type(0)), callables_table
+
+    def result_dtypes(self, scalar_dtype, segment_flag_dtype):
+        return (self.inner_reduction.result_dtypes(scalar_dtype)
                 + (segment_flag_dtype,))
 
     def __str__(self):
@@ -273,10 +345,26 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return hash(type(self))
 
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and (self.inner_reduction ==
+                other.inner_reduction)
 
-    def __call__(self, dtypes, operand1, operand2):
-        return SegmentedOp(self)(*(operand1 + operand2))
+    def __call__(self, dtypes, operand1, operand2, callables_table, target):
+        segmented_scalar_callable = ReductionCallable(
+                SegmentedOp(self))
+
+        # type-specialize the callable
+        segmented_scalar_callable, callables_table = (
+                segmented_scalar_callable.with_types(
+                    {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]},
+                    callables_table))
+
+        # populate callables_table
+        from loopy.program import update_table
+        func_id, callables_table = update_table(
+                callables_table, SegmentedOp(self), segmented_scalar_callable)
+
+        return (ResolvedFunction(func_id)(*(operand1 + operand2)),
+                callables_table)
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -284,34 +372,24 @@
     which = "sum"
     op = "((%s) + (%s))"
 
+    hash_fields = (
+            "which",
+            "op",)
+
+    update_persistent_hash = update_persistent_hash
+
 
 class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     base_reduction_class = ProductReductionOperation
     op = "((%s) * (%s))"
     which = "product"
 
+    hash_fields = (
+            "which",
+            "op",
+            "base_reduction_class",)
 
-def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    segment_flag_dtype = arg_dtypes[1]
-    prefix
= op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = update_persistent_hash # }}} @@ -337,15 +415,30 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, scalar_dtype, index_dtype): + def result_dtypes(self, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.library.function import MakeTupleCallable + from loopy.program import update_table + make_tuple_callable = MakeTupleCallable( + name="make_tuple") + + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), + callables_table) + + # populate callables_table + func_id, callables_table = update_table(callables_table, "make_tuple", + make_tuple_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -360,8 +453,22 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + arg_ext_scalar_callable = ReductionCallable(ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + callables_table)) + + # populate callables_table + from loopy.program import update_table + func_id, callables_table = update_table( + callables_table, ArgExtOp(self), arg_ext_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -369,43 +476,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if 
(op2 %(comp)s op1)
-        {
-            *index_out = index2;
-            return op2;
-        }
-        else
-        {
-            *index_out = index1;
-            return op1;
-        }
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(index_dtype),
-            comp=op.update_comparison,
-            ))
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -460,70 +547,86 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtOp):
-        from loopy.target.opencl import CFamilyTarget
-        if not isinstance(kernel.target, CFamilyTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        index_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, index_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    index_dtype,
-                    scalar_dtype,
-                    index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedOp):
-        from loopy.target.opencl import CFamilyTarget
-        if not isinstance(kernel.target, CFamilyTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        segment_flag_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, segment_flag_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    segment_flag_dtype,
-                    scalar_dtype,
-                    segment_flag_dtype),
-                )
-
-    return None
+# {{{ reduction specific callables
+
+class ReductionCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, callables_table):
+        scalar_dtype = arg_id_to_dtype[0]
+        index_dtype = arg_id_to_dtype[1]
+        result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype,
+                index_dtype)
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        new_arg_id_to_dtype[-1] = result_dtypes[0]
+        new_arg_id_to_dtype[-2] = result_dtypes[1]
+        name_in_target = self.name.reduction_op.prefix(scalar_dtype,
+                index_dtype) + "_op"
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                name_in_target=name_in_target), callables_table
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = arg_id_to_descr.copy()
+        new_arg_id_to_descr[-1] = ValueArgDescriptor()
+        return (
+                self.copy(arg_id_to_descr=new_arg_id_to_descr),
+                callables_table)
+
+    def generate_preambles(self, target):
+        if isinstance(self.name, ArgExtOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            index_dtype = self.arg_id_to_dtype[-2]
+
+            prefix = op.prefix(scalar_dtype, index_dtype)
+
+            yield (prefix, """
+            inline {scalar_t} {prefix}_op(
+                {scalar_t} op1, {index_t} index1,
+                {scalar_t} op2, {index_t} index2,
+                {index_t} *index_out)
+            {{
+                if (op2 {comp} op1)
+                {{
+                    *index_out = index2;
+                    return op2;
+                }}
+                else
+                {{
+                    *index_out = index1;
+                    return op1;
+                }}
+            }}
+            """.format(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    index_t=target.dtype_to_typename(index_dtype),
+                    comp=op.update_comparison,
+                    ))
+        elif isinstance(self.name, SegmentedOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            segment_flag_dtype = self.arg_id_to_dtype[-2]
+            prefix = op.prefix(scalar_dtype, segment_flag_dtype)
+
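+            # For example (a hypothetical instantiation, assuming a segmented
+            # sum over float32 data with int32 segment flags on a C-family
+            # target), the template below would render roughly as:
+            #
+            #     inline float loopy_segmented_sum_float32_int32_op(
+            #         float op1, int segment_flag1,
+            #         float op2, int segment_flag2,
+            #         int *segment_flag_out)
+            #     {
+            #         *segment_flag_out = segment_flag1 | segment_flag2;
+            #         return segment_flag2 ? op2 : ((op1) + (op2));
+            #     }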
+ yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : {combined}; + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 7f5744b482fa2fb6cfbed64ee27486af9cb36e40..73ca8d72824071b36bf91798ba9a1ea14e624db7 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -22,13 +22,15 @@ THE SOFTWARE. import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. - :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.find_loop_nest_around_map` """ result = {} @@ -52,7 +54,9 @@ def potential_loop_nest_map(kernel): return result -def fuse_loop_domains(kernel): +@iterate_over_kernels_if_given_program +def merge_loop_domains(kernel): + # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames while True: @@ -60,11 +64,13 @@ def fuse_loop_domains(kernel): parents_per_domain = kernel.parents_per_domain() all_parents_per_domain = kernel.all_parents_per_domain() + iname_to_insns = kernel.iname_to_insns() + new_domains = None for inner_iname, outer_inames in lnm.items(): for outer_iname in outer_inames: - # {{{ check if it's safe to fuse + # {{{ check if it's safe to merge inner_domain_idx = kernel.get_home_domain_index(inner_iname) outer_domain_idx = kernel.get_home_domain_index(outer_iname) @@ -72,12 +78,28 @@ def fuse_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if (not iname_to_insns[inner_iname] + or not iname_to_insns[outer_iname]): + # Inames without instructions occur when used in + # a SubArrayRef. We don't want monster SubArrayRef domains, + # so refuse to merge those. + continue + + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: + # The two inames are imperfectly nested. Domain fusion + # might be invalid when the inner loop is empty, leading to + # the outer loop also being empty. + + # FIXME: Not fully correct, does not consider reductions + # https://gitlab.tiker.net/inducer/loopy/issues/172 + continue + if ( outer_domain_idx in all_parents_per_domain[inner_domain_idx] and not outer_domain_idx == parents_per_domain[inner_domain_idx]): # Outer domain is not a direct parent of the inner - # domain. Unable to fuse. + # domain. Unable to merge. 
continue outer_dom = kernel.domains[outer_domain_idx] @@ -87,7 +109,7 @@ def fuse_loop_domains(kernel): if is_domain_dependent_on_inames(kernel, inner_domain_idx, outer_inames): # Bounds of inner domain depend on outer domain. - # Unable to fuse. + # Unable to merge. continue # }}} diff --git a/loopy/match.py b/loopy/match.py index 9160402b48c81e4126f0f73f8fde6f6f5406e8b4..7ecbfcfaef925890f2de9951e70feb9bf3fbbf6f 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -50,6 +50,7 @@ Match expressions .. autoclass:: Tagged .. autoclass:: Writes .. autoclass:: Reads +.. autoclass:: InKernel .. autoclass:: Iname """ @@ -74,6 +75,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") +_in_kernel = intern("_in_kernel") _iname = intern("_iname") _whitespace = intern("_whitespace") @@ -93,13 +95,14 @@ _LEX_TABLE = [ (_tag, RE(r"tag:([\w?*]+)")), (_writes, RE(r"writes:([\w?*]+)")), (_reads, RE(r"reads:([\w?*]+)")), + (_in_kernel, RE(r"in_kernel:([\w?*]+)")), (_iname, RE(r"iname:([\w?*]+)")), (_whitespace, RE("[ \t]+")), ] -_TERMINALS = ([_id, _tag, _writes, _reads, _iname]) +_TERMINALS = ([_id, _tag, _writes, _reads, _in_kernel, _iname]) # {{{ operator precedence @@ -293,6 +296,11 @@ class Reads(GlobMatchExpressionBase): for name in matchable.read_dependency_names()) +class InKernel(GlobMatchExpressionBase): + def __call__(self, kernel, matchable): + return self.re.match(kernel.name) + + class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) @@ -330,6 +338,10 @@ def parse_match(expr): result = Reads(pstate.next_match_obj().group(1)) pstate.advance() return result + elif next_tag is _in_kernel: + result = InKernel(pstate.next_match_obj().group(1)) + pstate.advance() + return result elif next_tag is _iname: result = Iname(pstate.next_match_obj().group(1)) pstate.advance() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e59c275d29c96775c143942e6c2477b78a8a2c07..90e527ae4a29715a81608079d4fdd88025cc0abf 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import logging +logger = logging.getLogger(__name__) from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -35,23 +36,34 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type from loopy.kernel.tools import kernel_has_global_barriers # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.transform.iname import remove_any_newly_unused_inames +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper +# from loopy.transform.iname import remove_any_newly_unused_inames -import logging -logger = logging.getLogger(__name__) +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger +from functools import partial # {{{ prepare for caching -def prepare_for_caching(kernel): +def prepare_for_caching_inner(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -70,6 +82,32 @@ def prepare_for_caching(kernel): return kernel + +def prepare_for_caching(program): + if isinstance(program, LoopKernel): + return prepare_for_caching_inner(program) + + assert isinstance(program, Program) + tgt = program.target + + new_clbls = {} + for name, clbl in program.callables_table.items(): + if clbl.arg_id_to_dtype is not None: + arg_id_to_dtype = {id: dtype.with_target(tgt) + for id, dtype in clbl.arg_id_to_dtype.items()} + clbl = clbl.copy(arg_id_to_dtype=arg_id_to_dtype) + if isinstance(clbl, ScalarCallable): + pass + elif isinstance(clbl, CallableKernel): + subknl = prepare_for_caching_inner(clbl.subkernel) + clbl = clbl.copy(subkernel=subknl) + else: + raise NotImplementedError(type(clbl)) + + new_clbls[name] = clbl + + return program.copy(callables_table=new_clbls) + # }}} @@ -244,15 +282,11 @@ def find_temporary_address_space(kernel): desired_aspace_per_insn.append(desired_aspace) if not desired_aspace_per_insn: - if temp_var.initializer is None: - warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, - "temporary variable '%s' never written, eliminating" - % temp_var.name, LoopyAdvisory) - else: - raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine address space" - % temp_var.name) + warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, + "cannot automatically determine address space of '%s'" + % temp_var.name, LoopyAdvisory) + new_temp_vars[temp_var.name] = temp_var continue overall_aspace = max(desired_aspace_per_insn) @@ -741,7 +775,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): # }}} - from loopy.kernel.instruction import CallInstruction + from loopy.kernel.instruction import CallInstruction, is_array_call for insn in kernel.instructions: if not isinstance(insn, CallInstruction): continue @@ -749,6 +783,9 @@ def 
_hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if len(insn.assignees) <= 1: continue + if is_array_call(insn.assignees, insn.expression): + continue + assignees = insn.assignees assignee_var_names = insn.assignee_var_names() @@ -882,10 +919,21 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -@remove_any_newly_unused_inames -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super().__init__(callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + +# @remove_any_newly_unused_inames +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -1005,7 +1053,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = insn.within_inames @@ -1037,13 +1085,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "{}_{}_init".format(insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes) + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way @@ -1087,13 +1138,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1160,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1139,7 +1194,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1190,7 +1245,8 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") init_insn = make_assignment( id=init_id, @@ -1250,17 +1306,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr = expr.expr transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1289,22 +1348,26 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1325,9 +1388,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1386,7 +1450,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = insn.within_inames @@ -1423,6 +1487,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) 
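+
+        # The init/update instruction pair built below follows the shape of a
+        # plain sequential scan; a minimal Python model of that shape (plain
+        # values and a Python loop stand in for loopy expressions and inames):
+        #
+        #     def realize_scan_seq(values, op, neutral):
+        #         acc = neutral            # init insn: the neutral element
+        #         out = []
+        #         for v in values:         # the sequential sweep iname
+        #             acc = op(acc, v)     # update insn: expr.operation(...)
+        #             out.append(acc)
+        #         return out
+        #
+        #     assert realize_scan_seq([1, 2, 3], lambda a, b: a + b, 0) == [1, 3, 6]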
+ init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1430,7 +1497,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way # of figuring out what the dependencies of the accumulator @@ -1456,13 +1523,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | {track_iname} + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1476,25 +1547,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = insn.within_inames @@ -1552,7 +1623,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1660,19 +1732,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1693,16 +1769,17 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. @@ -1710,7 +1787,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, infer_arg_and_reduction_dtypes_for_reduction_expression) arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = insn.within_inames bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1790,7 +1867,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1819,15 +1896,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1846,17 +1925,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes) # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1885,9 +1964,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. 
from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1967,13 +2049,32 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - kernel = lp.tag_inames(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_tags) kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table + + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + callables_table = dict(program.callables_table) + kernels_to_scan = [in_knl_callable.subkernel + for in_knl_callable in program.callables_table.values() + if isinstance(in_knl_callable, CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + callables_table[knl.name] = in_knl_callable + + return program.copy(callables_table=callables_table) # }}} @@ -2043,37 +2144,254 @@ def check_atomic_loads(kernel): # }}} -preprocess_cache = WriteOncePersistentDict( - "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +# {{{ arg_descr_inference +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Infers the :attr:`loopy` + """ -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) + def __init__(self, rule_mapping_context, caller_kernel, + callables_table): + super().__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.callables_table = callables_table + + def map_call(self, expr, expn_state, assignees=None): + from pymbolic.primitives import Call, CallWithKwargs, Variable + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction + from loopy.kernel.array import ArrayBase + from loopy.kernel.data import ValueArg + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper - from loopy.kernel import KernelState - if kernel.state >= KernelState.PREPROCESSED: - return kernel + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super().map_call(expr, expn_state) - # {{{ cache retrieval + arg_id_to_val = dict(enumerate(expr.parameters)) + if isinstance(expr, CallWithKwargs): + arg_id_to_val.update(expr.kw_parameters) - from loopy import CACHING_ENABLED - if CACHING_ENABLED: - input_kernel = kernel + if assignees is not None: + # If supplied with assignees then this is a CallInstruction + for i, arg in enumerate(assignees): + arg_id_to_val[-i-1] = arg - try: - result = preprocess_cache[kernel] - logger.debug("%s: preprocess cache hit" % kernel.name) - return result - except KeyError: - pass + from loopy.kernel.function_interface import 
get_arg_descriptor_for_expression + arg_id_to_descr = { + arg_id: get_arg_descriptor_for_expression( + self.caller_kernel, arg) + for arg_id, arg in arg_id_to_val.items()} + in_knl_callable = self.callables_table[expr.function.name] - # }}} + # {{{ translating descriptor expressions to the callable's namespace + + deps_as_params = [] + subst_map = {} + + deps = frozenset().union(*(descr.depends_on() + for descr in arg_id_to_descr.values())) + + assert deps <= self.caller_kernel.all_variable_names() + + for dep in deps: + caller_arg = self.caller_kernel.arg_dict.get(dep, None) + caller_arg = self.caller_kernel.temporary_variables.get(dep, caller_arg) + + if not (isinstance(caller_arg, ValueArg) or (isinstance(caller_arg, + ArrayBase) and caller_arg.shape == ())): + raise NotImplementedError(f"Obtained '{dep}' as a dependency for" + f" call '{expr.function.name}' which is not a scalar.") + + in_knl_callable, callee_name = in_knl_callable.with_added_arg( + caller_arg.dtype, ValueArgDescriptor()) + + subst_map[dep] = Variable(callee_name) + deps_as_params.append(Variable(dep)) + + mapper = SubstitutionMapper(make_subst_func(subst_map)) + arg_id_to_descr = {id_: descr.map_expr(mapper) + for id_, descr in arg_id_to_descr.items()} + + # }}} + + # specializing the function according to the parameter description + new_in_knl_callable, self.callables_table = ( + in_knl_callable.with_descrs( + arg_id_to_descr, self.callables_table)) + + # find the deps of the new in-kernel callable and add those arguments + # to the call + self.callables_table, new_func_id = ( + self.callables_table.with_callable( + expr.function.function, + new_in_knl_callable)) + + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters) + + tuple(deps_as_params)) + else: + # FIXME: Order for vars when kwargs are present? + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} + ) + + map_call_with_kwargs = map_call + + def __call__(self, expr, kernel, insn, assignees=None): + from loopy.kernel.data import InstructionBase + from loopy.symbolic import IdentityMapper, ExpansionState + assert insn is None or isinstance(insn, InstructionBase) + + return IdentityMapper.__call__(self, expr, + ExpansionState( + kernel=kernel, + instruction=insn, + stack=(), + arg_context={}), assignees=assignees) + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important role in + # determining the arg_id_to_descr + mapper = partial(self, kernel=kernel, insn=insn, + assignees=insn.assignees) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, MultiAssignmentBase): + mapper = partial(self, kernel=kernel, insn=insn) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, callables_table): + """ + Returns a copy of *kernel* with the argument shapes and strides matched for + the resolved functions in *kernel*.
Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, callables_table) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table + + +def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ + from loopy.program import make_clbl_inf_ctx, resolve_callables + from loopy.kernel.array import ArrayBase + from loopy.kernel.function_interface import (ArrayArgDescriptor, + ValueArgDescriptor) + from loopy import auto, ValueArg - logger.info("%s: preprocess start" % kernel.name) + program = resolve_callables(program) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() + + for e in program.entrypoints: + def _tuple_or_None(s): + if isinstance(s, tuple): + return s + elif s in [None, auto]: + return s + else: + return s, + + arg_id_to_descr = {} + for arg in program[e].args: + if isinstance(arg, ArrayBase): + if arg.shape not in (None, auto): + arg_id_to_descr[arg.name] = ArrayArgDescriptor( + _tuple_or_None(arg.shape), arg.address_space, + arg.dim_tags) + elif isinstance(arg, ValueArg): + arg_id_to_descr[arg.name] = ValueArgDescriptor() + else: + raise NotImplementedError() + new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( + arg_id_to_descr, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) + + return clbl_inf_ctx.finish_program(program, renamed_entrypoints) + +# }}} + + +# {{{ inline_kernels_with_gbarriers + + +def inline_kernels_with_gbarriers(program): + from loopy.kernel.instruction import BarrierInstruction + from loopy.transform.callable import inline_callable_kernel + + def has_gbarrier(knl): + return any((isinstance(insn, BarrierInstruction) + and insn.synchronization_kind == "global") + for insn in knl.instructions) + + callees_to_inline = [name for name, knl_clbl in program.callables_table.items() + if (isinstance(knl_clbl, CallableKernel) + and has_gbarrier(knl_clbl.subkernel))] + + for callee_to_inline in callees_to_inline: + program = inline_callable_kernel(program, callee_to_inline) + + return program + + +# }}} + + +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + +def preprocess_single_kernel(kernel, callables_table, device=None): + from loopy.kernel import KernelState + + prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -2089,20 +2407,82 @@ def preprocess_kernel(kernel, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
- kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) + # Ordering restriction: + # add_axes_to_temporaries_for_ilp because reduction accumulators + # need to be duplicated by this. + + kernel = realize_ilp(kernel) + + kernel = find_temporary_address_space(kernel) + + # check for atomic loads, much easier to do here now that the dependencies + # have been established + kernel = check_atomic_loads(kernel) + + kernel = kernel.target.preprocess(kernel) + + kernel = kernel.copy( + state=KernelState.PREPROCESSED) + + prepro_logger.done() + + return kernel + + +def preprocess_program(program, device=None): + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + if CACHING_ENABLED: + input_program = program + + try: + result = preprocess_cache[program] + logger.debug(f"program with entrypoints: {program.entrypoints}" + " preprocess cache hit") + return result + except KeyError: + pass + + # }}} + + from loopy.kernel import KernelState + if program.state >= KernelState.PREPROCESSED: + return program + + if len([clbl for clbl in program.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + program = program.with_entrypoints(",".join(clbl.name for clbl in + program.callables_table.values() if isinstance(clbl, + CallableKernel))) + + if not program.entrypoints: + raise LoopyError("Translation unit did not receive any entrypoints") + + from loopy.program import resolve_callables + program = resolve_callables(program) + + if device is not None: + # FIXME: Time to remove this? (Git blame shows 5 years ago) + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + from loopy.transform.subst import expand_subst + program = expand_subst(program) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) + program = apply_single_writer_depencency_heuristic(program) # Ordering restrictions: # @@ -2113,26 +2493,44 @@ def preprocess_kernel(kernel, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, unknown_types_ok=False) + program = realize_reduction(program, unknown_types_ok=False) - # Ordering restriction: - # add_axes_to_temporaries_for_ilp because reduction accumulators - # need to be duplicated by this. + # {{{ preprocess callable kernels - kernel = realize_ilp(kernel) + # Callable editing restrictions: + # + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] + # + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.callables_table, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable).__name__)) - kernel = find_temporary_address_space(kernel) + new_callables[func_id] = in_knl_callable - # check for atomic loads, much easier to do here now that the dependencies - # have been established - kernel = check_atomic_loads(kernel) + program = program.copy(callables_table=new_callables) - kernel = kernel.target.preprocess(kernel) + # }}} - logger.info("%s: preprocess done" % kernel.name) + # infer arg descrs of the callables + program = infer_arg_descr(program) - kernel = kernel.copy( - state=KernelState.PREPROCESSED) + # Ordering restriction: + # callees with gbarrier in them must be inlined after inferring arg_descr. + # inline_kernels_with_gbarriers does not recursively inline the callees. + program = inline_kernels_with_gbarriers(program) # {{{ prepare for caching @@ -2142,15 +2540,20 @@ def preprocess_kernel(kernel, device=None): # this target information. if CACHING_ENABLED: - input_kernel = prepare_for_caching(input_kernel) + input_program = prepare_for_caching(input_program) - kernel = prepare_for_caching(kernel) + program = prepare_for_caching(program) # }}} if CACHING_ENABLED: - preprocess_cache.store_if_not_present(input_kernel, kernel) + preprocess_cache.store_if_not_present(input_program, program) + + return program + + +# FIXME: Do we add a deprecation warning? +preprocess_kernel = preprocess_program - return kernel # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 0000000000000000000000000000000000000000..792abe59aa5f2b3660453d9f54d7c3d3dec94500 --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,793 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import re +import collections + +from pytools import ImmutableRecord +from pymbolic.primitives import Variable +from functools import wraps + +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleMappingContext) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction + +from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash +from pymbolic.primitives import Call, CallWithKwargs +from functools import reduce +from pyrsistent import pmap, PMap + +__doc__ = """ + +..
currentmodule:: loopy + +.. autoclass:: Program + +.. autofunction:: make_program +.. autofunction:: iterate_over_kernels_if_given_program + +""" + + +def find_in_knl_callable_from_identifier( + function_id_to_in_knl_callable_mappers, target, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if *identifier* + is known to any kernel function scoper, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + function_id_to_in_knl_callable_mappers): + # FIXME: do we really need to pass the target here? + in_knl_callable = func_id_to_in_knl_callable_mapper( + target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + +def _is_a_reduction_op(expr): + if isinstance(expr, ResolvedFunction): + return _is_a_reduction_op(expr.function) + + from loopy.library.reduction import ReductionOpFunction + return isinstance(expr, ReductionOpFunction) + + +class CallableResolver(RuleAwareIdentityMapper): + """ + Resolves callables in expressions and records the names of the calls + resolved. + + .. attribute:: known_callables + + An instance of :class:`frozenset` of the call names to be resolved. + + .. attribute:: rule_mapping_context + + An instance of :class:`loopy.symbolic.SubstitutionRuleMappingContext`. + """ + def __init__(self, rule_mapping_context, known_callables): + assert isinstance(known_callables, frozenset) + + super().__init__(rule_mapping_context) + + self.known_callables = known_callables + + # a record of the call names that were resolved + self.calls_resolved = set() + + def map_call(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + + if not _is_a_reduction_op(expr.function): + name, tag = parse_tagged_name(expr.function) + else: + if isinstance(expr.function, ResolvedFunction): + name = expr.function.function + else: + name = expr.function + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + + # record that we resolved a call + self.calls_resolved.add(name) + + function = expr.function + + if not isinstance(expr.function, ResolvedFunction): + function = ResolvedFunction(expr.function) + + return Call(function, params) + + return super().map_call(expr, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + name, tag = parse_tagged_name(expr.function) + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + kw_params = {kw: self.rec(par, expn_state) + for kw, par in expr.kw_parameters.items()} + + # record that we resolved a call + self.calls_resolved.add(name) + + return CallWithKwargs(ResolvedFunction(expr.function), params, kw_params) + + return super().map_call_with_kwargs(expr, expn_state) + + +# {{{ program + +class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: entrypoints + + A :class:`frozenset` of the names of the kernels which + could be called from the host. + + .. attribute:: callables_table + + An instance of :class:`pyrsistent.PMap` mapping the function + identifiers in a kernel to their associated instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + ..
attribute:: func_id_to_in_knl_callable_mappers + + A :class:`frozenset` of functions of the signature ``(target: + TargetBase, function_identifier: str)`` that would return an instance + of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommended to + go through :func:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable, any modifications should be done through :meth:`copy`. + + .. automethod:: __init__ + .. method:: __getitem__ + + Look up the resolved callable with identifier *name*. + """ + def __init__(self, + entrypoints=frozenset(), + callables_table=pmap(), + target=None, + func_id_to_in_knl_callable_mappers=[]): + + # {{{ sanity checks + + assert isinstance(callables_table, collections.abc.Mapping) + assert isinstance(entrypoints, frozenset) + + if not isinstance(callables_table, PMap): + callables_table = pmap(callables_table) + + # }}} + + super().__init__( + entrypoints=entrypoints, + callables_table=pmap(callables_table), + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "entrypoints", + "callables_table", + "target",) + + update_persistent_hash = update_persistent_hash + + def copy(self, **kwargs): + target = kwargs.pop("target", None) + program = super().copy(**kwargs) + if target: + from loopy.kernel import KernelState + if max(callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)) > ( + KernelState.INITIAL): + if not isinstance(target, type(self.target)): + raise LoopyError("One of the kernels in the program has been " + "preprocessed, cannot modify target now.") + + new_callables = {} + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + knl = knl.copy(target=target) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + new_callables[func_id] = clbl + + program = super().copy( + callables_table=new_callables, target=target) + + return program + + def with_entrypoints(self, entrypoints): + """ + :param entrypoints: Either a comma-separated :class:`str` or + :class:`frozenset`. + """ + if isinstance(entrypoints, str): + entrypoints = frozenset([e.strip() for e in + entrypoints.split(",")]) + + assert isinstance(entrypoints, frozenset) + + return self.copy(entrypoints=entrypoints) + + @property + def state(self): + """ Returns an instance of :class:`loopy.kernel.KernelState`. """ + return min(callable_knl.subkernel.state for callable_knl in + self.callables_table.values() if + isinstance(callable_knl, CallableKernel)) + + def with_kernel(self, kernel): + """ + If *self* contains a callable kernel with *kernel*'s name, replaces its + subkernel and returns a copy of *self*. Else records a new callable + kernel with *kernel* as its subkernel. + + :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :returns: Copy of *self* with updated callable kernels.
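+
+        *Example (an illustrative sketch; the entrypoint name ``"main"`` is
+        hypothetical):* ::
+
+            knl = prog["main"]            # look up the subkernel
+            knl = knl.copy(name="main")   # ...or any other modification
+            prog = prog.with_kernel(knl)  # store the updated kernel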
+ """ + if kernel.name in self.callables_table: + # update the callable kernel + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) + new_callables = self.callables_table.remove(kernel.name).set( + kernel.name, new_in_knl_callable) + return self.copy(callables_table=new_callables) + else: + # add a new callable kernel + clbl = CallableKernel(kernel) + new_callables = self.callables_table.set(kernel.name, clbl) + return self.copy(callables_table=new_callables) + + def __getitem__(self, name): + result = self.callables_table[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result + + def __call__(self, *args, **kwargs): + entrypoint = kwargs.get("entrypoint", None) + + if entrypoint is None: + # did not receive an entrypoint for the program to execute + if len(self.entrypoints) == 1: + entrypoint, = list(self.entrypoints) + else: + raise TypeError("Program.__call__() missing 1 required" + " keyword argument: 'entrypoint'") + + if entrypoint not in self.entrypoints: + raise LoopyError("'{}' not in list possible entrypoints supplied to" + " the program. Maybe you want to invoke 'with_entrypoints'" + " before calling the program.".format(entrypoint)) + + kwargs["entrypoint"] = entrypoint + + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + # FIXME: do a topological sort by the call graph + + def strify_callable(clbl): + return str(clbl.subkernel) + + return "\n".join( + strify_callable(clbl) + for name, clbl in self.callables_table.items() + if isinstance(clbl, CallableKernel)) + + def __setstate__(self, state_obj): + super().__setstate__(state_obj) + + self._program_executor_cache = {} + + def __hash__(self): + from loopy.tools import LoopyKeyBuilder + from pytools.persistent_dict import new_hash + key_hash = new_hash() + self.update_persistent_hash(key_hash, LoopyKeyBuilder()) + return hash(key_hash.digest()) + +# }}} + + +def next_indexed_function_identifier(function_id): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``'sin_0'`` will return ``'sin_1'``. + + :arg function_id: Either an instance of :class:`str`. + """ + + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function_id) + + if match is None: + if function_id[-1] == "_": + return f"{function_id}0" + else: + return f"{function_id}_0" + + return "{alpha}_{num}".format(alpha=match.group("alpha"), + num=int(match.group("num"))+1) + + +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. + """ + def __init__(self, rule_mapping_context, renaming_dict): + super().__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_function(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super().map_resolved_function( + expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renames according to *renaming_dict*. 
+ """ + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + + +class CallablesIDCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + def map_kernel(self, kernel): + callables_in_insn = frozenset() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_in_insn = callables_in_insn | ( + self(insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn).__name__) + + for rule in kernel.substitutions.values(): + callables_in_insn = callables_in_insn | ( + self(rule.expression)) + + return callables_in_insn + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def _get_callable_ids_for_knl(knl, callables): + clbl_id_collector = CallablesIDCollector() + + return frozenset().union(*( + _get_callable_ids_for_knl(callables[clbl].subkernel, callables) | + frozenset([clbl]) if isinstance(callables[clbl], CallableKernel) else + frozenset([clbl]) + for clbl in clbl_id_collector.map_kernel(knl))) + + +def _get_callable_ids(callables, entrypoints): + return frozenset().union(*( + _get_callable_ids_for_knl(callables[e].subkernel, callables) for e in + entrypoints)) + + +def make_clbl_inf_ctx(callables, entrypoints): + return CallablesInferenceContext(callables, _get_callable_ids(callables, + entrypoints)) + + +class CallablesInferenceContext(ImmutableRecord): + def __init__(self, callables, old_callable_ids, history={}): + assert isinstance(callables, collections.abc.Mapping) + callables = dict(callables) + + super().__init__( + callables=callables, + old_callable_ids=old_callable_ids, + history=history) + + # {{{ interface to perform edits on callables + + def with_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + + :arg in_kernel_callable: An instance of + :class:`loopy.InKernelCallable`. + """ + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.callables.values(): + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.callables.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = function.name + if isinstance(func_id, str): + return ( + self.copy( + history=history), + Variable(func_id)) + else: + assert isinstance(func_id, ReductionOpFunction) + return ( + self.copy( + history=history), + func_id) + + assert False + else: + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + # FIXME: Check that, if we have two ArgMax functions + # with different types in the same kernel, the generated code + # does not mess up the types. + unique_function_identifier = function.copy() + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + callables=updated_callables), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + + while unique_function_identifier in self.callables: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = function.name + + return ( + self.copy( + history=history, + callables=updated_callables), + Variable(unique_function_identifier)) + + def finish_program(self, program, renamed_entrypoints): + """ + Returns a copy of *program* with renaming of the callables done whenever + needed. + + *For example:* if all the ``sin`` calls diverged as ``sin_0, sin_1``, + then the renaming is done such that one of the flavors of the callable + is renamed back to ``sin``. + + :param renamed_entrypoints: A :class:`frozenset` of the names of the + renamed callable kernels which correspond to the entrypoints in + *self.callables_table*. + """ + assert len(renamed_entrypoints) == len(program.entrypoints) + new_callable_ids = _get_callable_ids(self.callables, renamed_entrypoints) + + callees_with_entrypoint_names = (program.entrypoints & + new_callable_ids) - renamed_entrypoints + + renames = {} + new_callables = {} + + for c in callees_with_entrypoint_names: + unique_function_identifier = c + + while unique_function_identifier in self.callables: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + renames[c] = unique_function_identifier + + # we should perform a rewrite here. + + for e in renamed_entrypoints: + renames[e] = self.history[e] + assert renames[e] in program.entrypoints + + # {{{ calculate the renames needed + + for old_func_id in ((self.old_callable_ids-new_callable_ids) - + program.entrypoints): + # at this point we should not rename anything to the names of + # entrypoints + for new_func_id in (new_callable_ids-renames.keys()) & set( + self.history.keys()): + if old_func_id == self.history[new_func_id]: + renames[new_func_id] = old_func_id + break + # }}} + + for e in renamed_entrypoints: + new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) + new_subkernel = rename_resolved_functions_in_a_single_kernel( + new_subkernel, renames) + new_callables[self.history[e]] = self.callables[e].copy( + subkernel=new_subkernel) + + for func_id in new_callable_ids-renamed_entrypoints: + in_knl_callable = self.callables[func_id] + if isinstance(in_knl_callable, CallableKernel): + # if callable kernel, perform renames inside its expressions.
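+                # (Illustrative: with renames == {"sin_0": "sin"}, every
+                # ResolvedFunction("sin_0") inside this callee's expressions
+                # becomes ResolvedFunction("sin").)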
+ old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, renames) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % + type(in_knl_callable).__name__) + + if func_id in renames: + new_func_id = renames[func_id] + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = (in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=new_func_id))) + new_callables[new_func_id] = in_knl_callable + else: + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=func_id)) + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + + # }}} + + def __getitem__(self, name): + result = self.callables[name] + return result + + +# {{{ helper functions + +def make_program(kernel): + """ + Returns an instance of :class:`loopy.Program` with *kernel* as the only + callable kernel. + """ + + program = Program( + callables_table={ + kernel.name: CallableKernel(kernel)}, + target=kernel.target) + + return program + + +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. + """ + def _collective_transform(*args, **kwargs): + if "program" in kwargs: + program_or_kernel = kwargs.pop("program") + elif "kernel" in kwargs: + program_or_kernel = kwargs.pop("kernel") + else: + program_or_kernel = args[0] + args = args[1:] + + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel, *args, **kwargs) + + return wraps(transform_for_single_kernel)(_collective_transform) + + +def update_table(callables_table, clbl_id, clbl): + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(clbl, InKernelCallable) + + for i, c in callables_table.items(): + if c == clbl: + return i, callables_table + + while clbl_id in callables_table: + clbl_id = next_indexed_function_identifier(clbl_id) + + callables_table[clbl_id] = clbl + + return clbl_id, callables_table + +# }}} + + +def resolve_callables(program): + """ + Returns a :class:`Program` with known :class:`pymbolic.primitives.Call` + expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. 
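+
+    *Example (an illustrative sketch):* a call ``sin(x[i])`` in an entrypoint
+    is rewritten as ``ResolvedFunction('sin')(x[i])``, and the returned
+    program's ``callables_table`` gains an entry mapping ``'sin'`` to the
+    matching :class:`loopy.kernel.function_interface.ScalarCallable`.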
+ """ + from loopy.library.function import get_loopy_callables + from loopy.kernel import KernelState + + if program.state >= KernelState.CALLS_RESOLVED: + # program's callables have been resolved + return program + + # get registered callables + known_callables = dict(program.callables_table) + # get target specific callables + known_callables.update(program.target.get_device_ast_builder().known_callables) + # get loopy specific callables + known_callables.update(get_loopy_callables()) + + callables_table = {} + + # callables: name of the calls seen in the program + callables = set(program.entrypoints) + + while callables: + clbl_name = callables.pop() + clbl = known_callables[clbl_name] + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + clbl_resolver = CallableResolver(rule_mapping_context, + frozenset(known_callables)) + knl = rule_mapping_context.finish_kernel(clbl_resolver.map_kernel(knl)) + knl = knl.copy(state=KernelState.CALLS_RESOLVED) + + # add the updated callable kernel to the table + callables_table[clbl_name] = clbl.copy(subkernel=knl) + + # note the resolved callable for traversal + callables.update(clbl_resolver.calls_resolved - set(callables_table)) + elif isinstance(clbl, ScalarCallable): + # nothing to resolve within a scalar callable + callables_table[clbl_name] = clbl + else: + raise NotImplementedError(f"{type(clbl)}") + + return program.copy(callables_table=callables_table) + + +# vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c6a9ec3ac1ed27048321deffabc746617e600dd8..91f7cf70f773234f069acba91be84a8745f50440 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1748,16 +1748,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " + "%s: Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " "instruction option to state that no " "synchronization is needed)" % ( + kernel_name, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1828,7 +1829,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 for dep in chain.from_iterable( dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is # executed unconditionally and so kills all sources before # the loop. 
@@ -1860,7 +1862,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) @@ -1926,7 +1929,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. warning:: @@ -1939,17 +1942,18 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - yield from generate_loop_schedules_inner(kernel, debug_args=debug_args) + yield from generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args) -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -2061,7 +2065,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2118,7 +2123,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2128,19 +2133,19 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, callables_table): warn_with_kernel( kernel, "get_one_scheduled_kernel_deprecated", "get_one_scheduled_kernel is deprecated. 
" "Use get_one_linearized_kernel instead.", DeprecationWarning) - return get_one_linearized_kernel(kernel) + return get_one_linearized_kernel(kernel, callables_table) -def get_one_linearized_kernel(kernel): +def get_one_linearized_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2158,7 +2163,8 @@ def get_one_linearized_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0192aa27dad1ac279c1e6b877fca3d2c1cae7e2d..f5ecf5b757ceeca222a7bd77cbefe9d236c20c60 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,4 +1,9 @@ -__copyright__ = "Copyright (C) 2015 James Stevens" +__copyright__ = """ +Copyright (C) 2015 James Stevens +Copyright (C) 2018 Kaushik Kulkarni +Copyright (C) 2019 Andreas Kloeckner +""" + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,12 +29,14 @@ import loopy as lp from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper -from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method +from loopy.kernel.function_interface import CallableKernel +from loopy.program import Program +from functools import partial __doc__ = """ @@ -37,6 +44,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: ToCountPolynomialMap .. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -56,6 +64,19 @@ __doc__ = """ """ +# FIXME: +# - The SUBGROUP granularity is completely broken if the root kernel +# contains the grid and the operations get counted in the callee. +# To test, most of those are set to WORKITEM instead below (marked +# with FIXMEs). This leads to value mismatches and key errors in +# the tests. +# - Currently, nothing prevents summation across different +# granularities, which is guaranteed to yield bogus results. 
+# - AccessFootprintGatherer needs to be redone to match get_op_map and +# get_mem_access_map style +# - Tests for the subkernel functionality need to be written + + def get_kernel_parameter_space(kernel): return isl.Space.create_from_names(kernel.isl_context, set=[], params=sorted(list(kernel.outer_params()))).params() @@ -69,11 +90,25 @@ def get_kernel_zero_pwqpolynomial(kernel): # {{{ GuardedPwQPolynomial +def _get_param_tuple(obj): + return tuple( + obj.get_dim_name(dim_type.param, i) + for i in range(obj.dim(dim_type.param))) + + class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): + assert isinstance(pwqpolynomial, isl.PwQPolynomial) self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain + assert (_get_param_tuple(pwqpolynomial.space) + == _get_param_tuple(valid_domain.space)) + + @property + def space(self): + return self.valid_domain.space + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( @@ -122,7 +157,7 @@ class GuardedPwQPolynomial: return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -130,7 +165,20 @@ class GuardedPwQPolynomial: # {{{ ToCountMap class ToCountMap: - """Maps any type of key to an arithmetic type. + """A map from work descriptors like :class:`Op` and :class:`MemAccess` + to any arithmetic type. + + .. automethod:: __getitem__ + .. automethod:: __str__ + .. automethod:: __repr__ + .. automethod:: __len__ + .. automethod:: get + .. automethod:: items + .. automethod:: keys + .. automethod:: values + + .. automethod:: copy + .. automethod:: with_set_attributes .. automethod:: filter_by .. automethod:: filter_by_func @@ -141,17 +189,20 @@ class ToCountMap: """ - def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): - if init_dict is None: - init_dict = {} - self.count_map = init_dict - self.val_type = val_type + def __init__(self, count_map=None): + if count_map is None: + count_map = {} + + self.count_map = count_map + + def _zero(self): + return 0 def __add__(self, other): result = self.count_map.copy() for k, v in other.count_map.items(): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result, self.val_type) + return self.copy(count_map=result) def __radd__(self, other): if other != 0: @@ -159,13 +210,14 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) + return self def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): - return ToCountMap({ - index: self.count_map[index]*other - for index in self.keys()}) + return self.copy({ + index: other*value + for index, value in self.count_map.items()}) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {} {}." @@ -174,21 +226,17 @@ class ToCountMap: __rmul__ = __mul__ def __getitem__(self, index): - try: - return self.count_map[index] - except KeyError: - #TODO what is the best way to handle this?
- if self.val_type is GuardedPwQPolynomial: - return GuardedPwQPolynomial.zero() - else: - return 0 - - def __setitem__(self, index, value): - self.count_map[index] = value + return self.count_map[index] def __repr__(self): return repr(self.count_map) + def __str__(self): + return "\n".join( + f"{k}: {v}" + for k, v in sorted(self.count_map.items(), + key=lambda k: str(k))) + def __len__(self): return len(self.count_map) @@ -201,17 +249,19 @@ class ToCountMap: def keys(self): return self.count_map.keys() - def pop(self, item): - return self.count_map.pop(item) + def values(self): + return self.count_map.values() + + def copy(self, count_map=None): + if count_map is None: + count_map = self.count_map - def copy(self): - return ToCountMap(dict(self.count_map), self.val_type) + return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return ToCountMap({ + return self.copy(count_map={ key.copy(**kwargs): val - for key, val in self.count_map.items()}, - self.val_type) + for key, val in self.count_map.items()}) def filter_by(self, **kwargs): """Remove items without specified key fields. @@ -238,28 +288,25 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - from loopy.types import to_loopy_type - if "dtype" in kwargs.keys(): - kwargs["dtype"] = [to_loopy_type(d) for d in kwargs["dtype"]] + class _Sentinel: + pass - # for each item in self.count_map - for self_key, self_val in self.items(): - try: - # check to see if key attribute values match all filters - for arg_field, allowable_vals in kwargs.items(): - attr_val = getattr(self_key, arg_field) - # see if the value is in the filter list - if attr_val not in allowable_vals: - break - else: # loop terminated without break or error - result_map[self_key] = self_val - except(AttributeError): - # the field passed is not a field of this key - continue + new_kwargs = {} + for arg_field, allowable_vals in kwargs.items(): + if arg_field == "dtype": + from loopy.types import to_loopy_type + allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] + + new_kwargs[arg_field] = allowable_vals - return result_map + for key, val in self.count_map.items(): + if all(getattr(key, arg_field, _Sentinel) in allowable_vals + for arg_field, allowable_vals in new_kwargs.items()): + new_count_map[key] = val + + return self.copy(count_map=new_count_map) def filter_by_func(self, func): """Keep items that pass a test. 
@@ -286,14 +333,13 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - # for each item in self.count_map, call func on the key - for self_key, self_val in self.items(): + for self_key, self_val in self.count_map.items(): if func(self_key): - result_map[self_key] = self_val + new_count_map[self_key] = self_val - return result_map + return self.copy(count_map=new_count_map) def group_by(self, *args): """Group map items together, distinguishing by only the key fields @@ -341,7 +387,7 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} # make sure all item keys have same type if self.count_map: @@ -350,22 +396,17 @@ class ToCountMap: raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") else: - return result_map - - # for each item in self.count_map - for self_key, self_val in self.items(): - new_key = key_type() + return self - # set all specified fields - for field in args: - setattr(new_key, field, getattr(self_key, field)) + for self_key, self_val in self.count_map.items(): + new_key = key_type( + **{ + field: getattr(self_key, field) + for field in args}) - if new_key in result_map.keys(): - result_map[new_key] += self_val - else: - result_map[new_key] = self_val + new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val - return result_map + return self.copy(count_map=new_count_map) def to_bytes(self): """Convert counts to bytes using data type in map key. @@ -398,48 +439,74 @@ class ToCountMap: """ - result = self.copy() - - for key, val in self.items(): - bytes_processed = int(key.dtype.itemsize) * val - result[key] = bytes_processed + new_count_map = {} - #TODO again, is this okay? - result.val_type = int + for key, val in self.count_map.items(): + new_count_map[key] = int(key.dtype.itemsize) * val - return result + return self.copy(new_count_map) def sum(self): - """Add all counts in ToCountMap. + """:return: A sum of the values of the dictionary.""" - :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the - sum of counts. + total = self._zero() - """ + for k, v in self.count_map.items(): + total = v + total - if self.val_type is GuardedPwQPolynomial: - total = GuardedPwQPolynomial.zero() - else: - total = 0 - - for k, v in self.items(): - total += v return total - #TODO test and document - def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result +# }}} + + +# {{{ ToCountPolynomialMap + +class ToCountPolynomialMap(ToCountMap): + """Maps any type of key to a :class:`islpy.PwQPolynomial` or a + :class:`GuardedPwQPolynomial`. + """ + + def __init__(self, space, count_map=None): + if not isinstance(space, isl.Space): + raise TypeError( + "first argument to ToCountPolynomialMap must be " + "of type islpy.Space") - def eval_and_sum(self, params): - """Add all counts in :class:`ToCountMap` and evaluate with provided - parameter dict. + assert space.is_params() + self.space = space - :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. 
+ space_param_tuple = _get_param_tuple(space) + + for key, val in count_map.items(): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) == 1 + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) == 1 + else: + raise TypeError("unexpected value type") + + assert _get_param_tuple(val.space) == space_param_tuple + + super().__init__(count_map) + + def _zero(self): + space = self.space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + + def copy(self, count_map=None, space=None): + if count_map is None: + count_map = self.count_map + + if space is None: + space = self.space + + return type(self)(space, count_map) + + def eval_and_sum(self, params=None): + """Add all counts and evaluate with provided parameter dict *params* + + :return: An :class:`int` containing the sum of all counts + evaluated with the parameters provided. Example usage:: @@ -454,18 +521,69 @@ class ToCountMap: # (now use these counts to, e.g., predict performance) """ + if params is None: + params = {} + return self.sum().eval_with_dict(params) # }}} +# {{{ subst_into_to_count_map + +def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial, get_param_subst_domain + + poly = subst_into_pwqpolynomial( + new_space, guarded_poly.pwqpolynomial, subst_dict) + + valid_domain = guarded_poly.valid_domain + i_begin_subst_space = valid_domain.dim(dim_type.param) + + valid_domain, subst_domain, _ = get_param_subst_domain( + new_space, guarded_poly.valid_domain, subst_dict) + + valid_domain = valid_domain & subst_domain + valid_domain = valid_domain.project_out(dim_type.param, 0, i_begin_subst_space) + return GuardedPwQPolynomial(poly, valid_domain) + + +def subst_into_to_count_map(space, tcm, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial + new_count_map = {} + for key, value in tcm.count_map.items(): + if isinstance(value, GuardedPwQPolynomial): + new_count_map[key] = subst_into_guarded_pwqpolynomial( + space, value, subst_dict) + + elif isinstance(value, isl.PwQPolynomial): + new_count_map[key] = subst_into_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, int): + new_count_map[key] = value + + else: + raise ValueError("unexpected value type") + + return tcm.copy(space=space, count_map=new_count_map) + +# }}} + + def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." + " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + result = "" for key in sorted(m.keys(), key=lambda k: str(k)): result += ("{} : {}\n".format(key, m[key])) return result +# {{{ CountGranularity + class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -492,10 +610,12 @@ class CountGranularity: WORKGROUP = "workgroup" ALL = [WORKITEM, SUBGROUP, WORKGROUP] +# }}} + # {{{ Op descriptor -class Op(Record): +class Op(ImmutableRecord): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -521,34 +641,41 @@ class Op(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
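+
+    *Example (an illustrative sketch; the kernel name is hypothetical):* ::
+
+        Op(dtype=np.float32, name="add",
+           count_granularity=CountGranularity.SUBGROUP,
+           kernel_name="loopy_kernel")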
+ """ - def __init__(self, dtype=None, name=None, count_granularity=None): + def __init__(self, dtype=None, name=None, count_granularity=None, + kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity) - else: + + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity) + dtype = to_loopy_type(dtype) - def __hash__(self): - return hash(repr(self)) + super().__init__(dtype=dtype, name=name, + count_granularity=count_granularity, + kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return f"Op({self.dtype}, {self.name}, {self.count_granularity})" + if self.kernel_name is not None: + return (f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}",' + f' "{self.kernel_name}")') + else: + return f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}")' # }}} # {{{ MemAccess descriptor -class MemAccess(Record): +class MemAccess(ImmutableRecord): """A descriptor for a type of memory access. .. attribute:: mtype @@ -608,12 +735,15 @@ class MemAccess(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. """ def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, direction=None, variable=None, *, variable_tags=None, variable_tag=None, - count_granularity=None): + count_granularity=None, kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -638,18 +768,16 @@ class MemAccess(Record): # }}} - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, + if dtype is not None: + from loopy.types import to_loopy_type + dtype = to_loopy_type(dtype) + + ImmutableRecord.__init__(self, mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tags=variable_tags, - count_granularity=count_granularity) - else: - from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tags=variable_tags, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) @property def variable_tag(self): @@ -666,13 +794,12 @@ class MemAccess(Record): return tag def __hash__(self): - # Note that this means lid_strides and gid_strides must be sorted - # in self.__repr__() + # dicts in gid_strides and lid_strides aren't natively hashable return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess({}, {}, {}, {}, {}, {}, {}, {})".format( + return "MemAccess({}, {}, {}, {}, {}, {}, {}, {}, {})".format( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -682,33 +809,101 @@ class MemAccess(Record): self.direction, self.variable, self.variable_tags, - self.count_granularity) + self.count_granularity, + self.kernel_name) +# }}} + + +# 
{{{ Sync descriptor + +class Sync(ImmutableRecord): + """A descriptor for a type of synchronization. + + .. attribute:: kind + + A string describing the synchronization kind, e.g. ``"barrier_global"`` or + ``"barrier_local"`` or ``"kernel_launch"``. + + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ + + def __init__(self, kind=None, kernel_name=None): + super().__init__(kind=kind, kernel_name=kernel_name) + + def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness + return f"Sync({self.kind}, {self.kernel_name})" # }}} -# {{{ counter base +# {{{ CounterBase class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, callables_table, kernel_rec): self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.callables_table = callables_table + self.kernel_rec = kernel_rec + + from loopy.type_inference import TypeReader + self.type_inf = TypeReader(knl, callables_table) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 + + @property + @memoize_method + def param_space(self): + return get_kernel_parameter_space(self.knl) + + def new_poly_map(self, count_map): + return ToCountPolynomialMap(self.param_space, count_map) + + def new_zero_poly_map(self): + return self.new_poly_map({}) def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() def map_call(self, expr): - return self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.kernel.data import ValueArg + if isinstance(clbl, CallableKernel): + sub_result = self.kernel_rec(clbl.subkernel) + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(expr.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + return subst_into_to_count_map( + self.param_space, + sub_result, subst_dict) \ + + self.rec(expr.parameters) + + else: + raise NotImplementedError() + + def map_call_with_kwargs(self, expr): + # FIXME + raise NotImplementedError() def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) else: - return ToCountMap() + return self.new_zero_poly_map() map_product = map_sum @@ -737,8 +932,8 @@ class CounterBase(CombineMapper): map_derivative = map_common_subexpression map_slice = map_common_subexpression - # preprocessing should have removed these def map_reduction(self, expr): + # preprocessing should have removed these raise RuntimeError("%s encountered %s--not supposed to happen" % (type(self).__name__, type(expr).__name__)) @@ -748,60 +943,81 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, count_within_subscripts=True): - self.knl = knl + def __init__(self, knl, callables_table, kernel_rec, + count_within_subscripts=True): + super().__init__( + knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return 
sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() map_tagged_variable = map_constant map_variable = map_constant def map_call(self, expr): - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name="func:"+str(expr.function), - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.new_poly_map( + {Op(dtype=self.type_inf(expr), + name="func:"+clbl.name, + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.parameters) + else: + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: return self.rec(expr.index) else: - return ToCountMap() + return self.new_zero_poly_map() + + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() def map_sum(self, expr): assert expr.children - return ToCountMap( + return self.new_poly_map( {Op(dtype=self.type_inf(expr), name="add", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), + return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): 1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): self.one}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), + self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): -1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="div", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -809,32 +1025,36 @@ class ExpressionOpCounter(CounterBase): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="pow", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="shift", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return 
self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)}) \ + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -855,9 +1075,10 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="maxmin", - count_granularity=CountGranularity.SUBGROUP): + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -898,6 +1119,8 @@ class _IndexStrideCoefficientCollector(CoefficientCollector): # }}} +# {{{ _get_lid_and_gid_strides + def _get_lid_and_gid_strides(knl, array, index): # find all local and global index tags and corresponding inames from loopy.symbolic import get_dependencies @@ -982,28 +1205,49 @@ def _get_lid_and_gid_strides(knl, array, index): return get_iname_strides(lid_to_iname), get_iname_strides(gid_to_iname) +# }}} -class MemAccessCounter(CounterBase): - pass + +# {{{ MemAccessCounterBase + +class MemAccessCounterBase(CounterBase): + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() + + def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.rec(expr.parameters) + else: + return super().map_call(expr) + +# }}} # {{{ LocalMemAccessCounter -class LocalMemAccessCounter(MemAccessCounter): +class LocalMemAccessCounter(MemAccessCounterBase): + local_mem_count_granularity = CountGranularity.SUBGROUP + def count_var_access(self, dtype, name, index): - sub_map = ToCountMap() + count_map = {} if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.address_space == AddressSpace.LOCAL): if index is None: # no subscript - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, - count_granularity=CountGranularity.SUBGROUP) - ] = 1 - return sub_map + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one + return self.new_poly_map(count_map) array = self.knl.temporary_variables[name] @@ -1015,15 +1259,16 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, - count_granularity=CountGranularity.SUBGROUP)] = 1 + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one - return sub_map + return self.new_poly_map(count_map) def map_variable(self, expr): return self.count_var_access( @@ 
-1042,7 +1287,7 @@ class LocalMemAccessCounter(MemAccessCounter): # {{{ GlobalMemAccessCounter -class GlobalMemAccessCounter(MemAccessCounter): +class GlobalMemAccessCounter(MemAccessCounterBase): def map_variable(self, expr): name = expr.name @@ -1050,17 +1295,18 @@ class GlobalMemAccessCounter(MemAccessCounter): array = self.knl.arg_dict[name] else: # this is a temporary variable - return ToCountMap() + return self.new_zero_poly_map() if not isinstance(array, lp.ArrayArg): # this array is not in global memory - return ToCountMap() + return self.new_zero_poly_map() - return ToCountMap({MemAccess(mtype="global", - dtype=self.type_inf(expr), lid_strides={}, - gid_strides={}, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + return self.new_poly_map({MemAccess(mtype="global", + dtype=self.type_inf(expr), lid_strides={}, + gid_strides={}, variable=name, + count_granularity=CountGranularity.WORKITEM, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.index) def map_subscript(self, expr): name = expr.aggregate.name @@ -1086,19 +1332,27 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - count_granularity = CountGranularity.WORKITEM if ( - 0 in lid_strides and lid_strides[0] != 0 - ) else CountGranularity.SUBGROUP + global_access_count_granularity = CountGranularity.SUBGROUP - return ToCountMap({MemAccess( + # Account for broadcasts once per subgroup + count_granularity = CountGranularity.WORKITEM if ( + # if the stride in lid.0 is known + 0 in lid_strides + and + # it is nonzero + lid_strides[0] != 0 + ) else global_access_count_granularity + + return self.new_poly_map({MemAccess( mtype="global", dtype=self.type_inf(expr), lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, variable_tags=var_tags, - count_granularity=count_granularity - ): 1} + count_granularity=count_granularity, + kernel_name=self.knl.name, + ): self.one} ) + self.rec(expr.index_tuple) # }}} @@ -1174,10 +1428,19 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count def add_assumptions_guard(kernel, pwqpolynomial): - return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + return GuardedPwQPolynomial( + pwqpolynomial, + kernel.assumptions.align_params(pwqpolynomial.space)) def count(kernel, set, space=None): + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return count(kernel[kernel_names[0]], set, space) + try: if space is not None: set = set.align_params(space) @@ -1186,7 +1449,7 @@ def count(kernel, set, space=None): except AttributeError: pass - count = isl.PwQPolynomial.zero( + total_count = isl.PwQPolynomial.zero( set.space .drop_dims(dim_type.set, 0, set.dim(dim_type.set)) .add_dims(dim_type.set, 1)) @@ -1248,7 +1511,7 @@ def count(kernel, set, space=None): # }}} if bset_count is not None: - count += bset_count + total_count += bset_count is_subset = bset <= bset_rebuilt is_superset = bset >= bset_rebuilt @@ -1273,12 +1536,12 @@ def count(kernel, set, space=None): "number of integer points in your loop " "domain.") - return add_assumptions_guard(kernel, count) + return add_assumptions_guard(kernel, total_count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes): +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes): # FIXME: 
Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1327,29 +1590,29 @@ def count_inames_domain(knl, inames): return count(knl, domain, space=space) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, callables_table, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = insn.within_inames if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) c = count_inames_domain(knl, insn_inames) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes) + unused_fac = get_unused_hw_axes_factor(knl, callables_table, + insn, disregard_local_axes=disregard_local_axes) return c * unused_fac else: return c -@memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1361,19 +1624,21 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, callables_table, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: - # get the group size + # {{{ compute workgroup_size + from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1393,15 +1658,18 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, % (CountGranularity.SUBGROUP, local_size)) workgroup_size *= s + # }}} + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for work-group size " "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" + "multiple device programs present, actual sub-group count may be " "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: # this should not happen since this is enforced in Op/MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" @@ -1413,17 +1681,52 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, - count_within_subscripts=True, subgroup_size=None): +def _get_op_map_for_single_kernel(knl, callables_table, + count_redundant_work, + count_within_subscripts, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_op_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) + + op_counter = ExpressionOpCounter(knl, callables_table, kernel_rec, + count_within_subscripts) + op_map = op_counter.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignees) + op_counter(insn.expression) + for key, val in ops.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + op_map = op_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, + count_within_subscripts=True, subgroup_size=None, + entrypoint=None): """Count the number of operations in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. 
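As a usage sketch of the interface above (the kernel, the ``subgroup_size``
value and the parameter value for ``n`` are illustrative assumptions, not part
of this patch)::

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    # count all operations, then restrict to f32 multiplies
    op_map = lp.get_op_map(knl, subgroup_size=32, count_redundant_work=True)
    f32_mul = op_map.filter_by(name=["mul"], dtype=[np.float32])
    print(f32_mul.eval_and_sum({"n": 512}))

With a single entrypoint, *entrypoint* may be omitted, as above.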
@@ -1474,53 +1777,37 @@
 
     """
 
-    subgroup_size = _process_subgroup_size(knl, subgroup_size)
+    if entrypoint is None:
+        if len(program.entrypoints) > 1:
+            raise LoopyError("Must provide entrypoint")
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
+        entrypoint = list(program.entrypoints)[0]
 
-    op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl, count_within_subscripts)
+    assert entrypoint in program.entrypoints
 
-    from loopy.kernel.instruction import (
-            CallInstruction, CInstruction, Assignment,
-            NoOpInstruction, BarrierInstruction)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    program = preprocess_program(program)
 
-    for insn in knl.instructions:
-        if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
-            ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            for key, val in ops.count_map.items():
-                op_map = (
-                        op_map
-                        + ToCountMap({key: val})
-                        * _get_insn_count(knl, insn.id, subgroup_size,
-                            count_redundant_work,
-                            key.count_granularity))
+    # Ordering restriction: preprocess might insert arguments to
+    # make strides valid. Those also need to go through type inference.
+    program = infer_unknown_types(program, expect_completion=True)
 
-        elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
-            pass
-        else:
-            raise NotImplementedError("unexpected instruction item type: '%s'"
-                    % type(insn).__name__)
+    if numpy_types is not None:
+        from warnings import warn
+        warn("numpy_types is being ignored and will be removed in 2020.",
+                DeprecationWarning, stacklevel=2)
 
-    if numpy_types:
-        return ToCountMap(
-                init_dict={
-                    Op(
-                        dtype=op.dtype.numpy_dtype,
-                        name=op.name,
-                        count_granularity=op.count_granularity):
-                    ct
-                    for op, ct in op_map.count_map.items()},
-                val_type=op_map.val_type
-                )
-    else:
-        return op_map
+    return _get_op_map_for_single_kernel(
+            program[entrypoint], program.callables_table,
+            count_redundant_work=count_redundant_work,
+            count_within_subscripts=count_within_subscripts,
+            subgroup_size=subgroup_size)
 
 # }}}
 
 
+# {{{ subgroup size finding
+
 def _find_subgroup_size_for_knl(knl):
     from loopy.target.pyopencl import PyOpenCLTarget
     if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None:
@@ -1572,20 +1859,66 @@ def _process_subgroup_size(knl, subgroup_size_requested):
                     "must be integer, 'guess', or, if you're feeling "
                    "lucky, None."
% (subgroup_size_requested)) +# }}} + # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): +def _get_mem_access_map_for_single_kernel(knl, callables_table, + count_redundant_work, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_mem_access_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + + access_counter_g = GlobalMemAccessCounter( + knl, callables_table, kernel_rec) + access_counter_l = LocalMemAccessCounter( + knl, callables_table, kernel_rec) + access_map = access_counter_g.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + insn_access_map = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + for assignee in insn.assignees: + insn_access_map = insn_access_map + ( + access_counter_g(assignee) + + access_counter_l(assignee) + ).with_set_attributes(direction="store") + + for key, val in insn_access_map.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + access_map = access_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return access_map + + +def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, + subgroup_size=None, entrypoint=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1662,72 +1995,86 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ - subgroup_size = _process_subgroup_size(knl, subgroup_size) + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + entrypoint = list(program.entrypoints)[0] - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + assert entrypoint in program.entrypoints - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + from loopy.preprocess import preprocess_program, infer_unknown_types - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - insn_access_map = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. 
Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - for assignee in insn.assignees: - insn_access_map += ( - access_counter_g(assignee) - + access_counter_l(assignee) - ).with_set_attributes(direction="store") - - for key, val in insn_access_map.count_map.items(): - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): - pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - if numpy_types: - return ToCountMap( - init_dict={ - MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tags=mem_access.variable_tags, - count_granularity=mem_access.count_granularity): - ct - for mem_access, ct in access_map.count_map.items()}, - val_type=access_map.val_type - ) - else: - return access_map + return _get_mem_access_map_for_single_kernel( + program[entrypoint], program.callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def _get_synchronization_map_for_single_kernel(knl, callables_table, + subgroup_size=None): + + knl = lp.get_one_scheduled_kernel(knl, callables_table) + + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + CallKernel, ReturnFromKernel, RunInstruction) + + kernel_rec = partial(_get_synchronization_map_for_single_kernel, + callables_table=callables_table, + subgroup_size=subgroup_size) + sync_counter = CounterBase(knl, callables_table, kernel_rec) + sync_map = sync_counter.new_zero_poly_map() + + iname_list = [] + + for sched_item in knl.schedule: + if isinstance(sched_item, EnterLoop): + if sched_item.iname: # (if not empty) + iname_list.append(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + if sched_item.iname: # (if not empty) + iname_list.pop() + + elif isinstance(sched_item, Barrier): + sync_map = sync_map + ToCountMap( + {Sync( + "barrier_%s" % sched_item.synchronization_kind, + knl.name): count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, RunInstruction): + pass + + elif isinstance(sched_item, CallKernel): + sync_map = sync_map + ToCountMap( + {Sync("kernel_launch", knl.name): + count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, ReturnFromKernel): + pass + + else: + raise LoopyError("unexpected schedule item: %s" + % type(sched_item).__name__) + + return sync_map + + +def get_synchronization_map(program, subgroup_size=None, entrypoint=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. 
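A minimal sketch of how the resulting map might be consumed (the parameter
value is an illustrative assumption); note that after this change the keys
are ``Sync`` descriptors rather than plain strings::

    sync_map = lp.get_synchronization_map(knl)

    # keys are Sync descriptors; values count events per work-item
    for sync_event, count in sync_map.count_map.items():
        print(sync_event.kind, count.eval_with_dict({"n": 512}))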
@@ -1763,79 +2110,30 @@ def get_synchronization_map(knl, subgroup_size=None): # (now use this count to, e.g., predict performance) """ + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, - CallKernel, ReturnFromKernel, RunInstruction) - from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - iname_list = [] - - result = ToCountMap() - - one = isl.PwQPolynomial("{ 1 }") + entrypoint = list(program.entrypoints)[0] - def get_count_poly(iname_list): - if iname_list: # (if iname_list is not empty) - ct = (count(knl, ( - knl.get_inames_domain(iname_list). - project_out_except(iname_list, [dim_type.set]) - )), ) - return reduce(mul, ct) - else: - return one + assert entrypoint in program.entrypoints + from loopy.preprocess import preprocess_program, infer_unknown_types - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - if sched_item.iname: # (if not empty) - iname_list.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - if sched_item.iname: # (if not empty) - iname_list.pop() - - elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: - get_count_poly(iname_list)}) - - elif isinstance(sched_item, CallKernel): - result = result + ToCountMap( - {"kernel_launch": get_count_poly(iname_list)}) - - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): - pass - - else: - raise LoopyError("unexpected schedule item: %s" - % type(sched_item).__name__) + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - return result + return _get_synchronization_map_for_single_kernel( + program[entrypoint], program.callables_table, + subgroup_size=subgroup_size) # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` to - :class:`islpy.Set` instances capturing which indices of each the array - *var_name* are read/written (where *direction* is either ``read`` or - ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for accesses - on which the footprint cannot be determined (e.g. data-dependent or - nonlinear indices) - """ - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - +def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): write_footprints = [] read_footprints = [] @@ -1858,6 +2156,48 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False, entrypoint=None): + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. 
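+
+    For example, the key ``("a", "read")`` maps to the set of indices of
+    array ``a`` that the kernel reads.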
+
+    :arg ignore_uncountable: If *False*, an error will be raised for accesses
+        on which the footprint cannot be determined (e.g. data-dependent or
+        nonlinear indices)
+    """
+
+    if entrypoint is None:
+        if len(program.entrypoints) > 1:
+            raise LoopyError("Must provide entrypoint")
+
+        entrypoint = list(program.entrypoints)[0]
+
+    assert entrypoint in program.entrypoints
+
+    # FIXME: works only for one callable kernel till now.
+    if len([in_knl_callable for in_knl_callable in
+            program.callables_table.values() if isinstance(in_knl_callable,
+                CallableKernel)]) != 1:
+        raise NotImplementedError("Currently only supported for program with "
+            "only one CallableKernel.")
+
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = preprocess_program(program)
+    # Ordering restriction: preprocess might insert arguments to
+    # make strides valid. Those also need to go through type inference.
+    program = infer_unknown_types(program, expect_completion=True)
+
+    write_footprints, read_footprints = _gather_access_footprints_for_single_kernel(
+            program[entrypoint], ignore_uncountable)
+
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
     read_footprints = AccessFootprintGatherer.combine(read_footprints)
 
@@ -1872,7 +2212,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
     return result
 
 
-def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
+def gather_access_footprint_bytes(program, ignore_uncountable=False):
     """Return a dictionary mapping ``(var_name, direction)`` to
     :class:`islpy.PwQPolynomial` instances capturing the number of bytes  are
     read/written (where *direction* is either ``read`` or ``write`` on array
@@ -1883,12 +2223,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         nonlinear indices)
     """
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    kernel = infer_unknown_types(program, expect_completion=True)
 
     from loopy.kernel import KernelState
     if kernel.state < KernelState.PREPROCESSED:
-        kernel = preprocess_kernel(kernel)
+        kernel = preprocess_program(program)
 
     result = {}
     fp = gather_access_footprints(kernel,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index e083a87d814e5e935edee1be795eac9d8244f693..82f7525dcc43bdd089042a2f72f511d054ab0101 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \
         ConstantFoldingMapper as ConstantFoldingMapperBase
 
 from pymbolic.parser import Parser as ParserBase
-
+from loopy.diagnostic import LoopyError
 from loopy.diagnostic import ExpressionToAffineConversionError
 
 import islpy as isl
@@ -138,7 +138,14 @@ class IdentityMapperMixin:
         return expr
 
     def map_type_annotation(self, expr, *args, **kwargs):
-        return type(expr)(expr.type, self.rec(expr.child))
+        return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs))
+
+    def map_sub_array_ref(self, expr, *args, **kwargs):
+        return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs),
+                self.rec(expr.subscript, *args, **kwargs))
+
+    def map_resolved_function(self, expr, *args, **kwargs):
+        return ResolvedFunction(expr.function)
 
     map_type_cast = map_type_annotation
 
@@ -197,15 +204,34 @@ class WalkMapper(WalkMapperBase):
 
     map_rule_argument = map_group_hw_index
 
+    def map_sub_array_ref(self, expr, *args):
+        if not self.visit(expr):
+            return
+
+        
self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): def map_reduction(self, expr, *args, **kwargs): return self.rec(expr.expr, *args, **kwargs) + def map_sub_array_ref(self, expr): + return self.combine(( + self.rec(expr.subscript), + self.combine(tuple(self.rec(idx) for idx in expr.swept_inames)))) + map_linear_subscript = CombineMapperBase.map_subscript @@ -265,6 +291,16 @@ class StringifyMapper(StringifyMapperBase): return "cast({}, {})".format( repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + # underlining a resolved call + return "\u0332".join(str(expr.function)) + + def map_sub_array_ref(self, expr, prec): + return "[{inames}]: {subscr}".format( + inames=",".join(self.rec(iname, prec) for iname in + expr.swept_inames), + subscr=self.rec(expr.subscript, prec)) + class EqualityPreservingStringifyMapper(StringifyMapperBase): """ @@ -304,7 +340,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation)): # noqa + or type(expr.function) != type(other.function)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -339,6 +375,13 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args, **kwargs) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. + return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) return deps - {p.Variable(iname) for iname in expr.inames} @@ -349,11 +392,18 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() + def map_sub_array_ref(self, expr, *args, **kwargs): + deps = self.rec(expr.subscript, *args, **kwargs) + return deps - set(expr.swept_inames) + map_linear_subscript = DependencyMapperBase.map_subscript def map_type_cast(self, expr, *args, **kwargs): return self.rec(expr.child, *args, **kwargs) + def map_resolved_function(self, expr): + return self.rec(expr.function) + def map_literal(self, expr): return set() @@ -621,7 +671,6 @@ class Reduction(LoopyExpressionBase): Represents a reduction operation on :attr:`expr` across :attr:`inames`. .. attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` .. attribute:: inames @@ -748,6 +797,170 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(LoopyExpressionBase): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. 
attribute:: function
+
+        An instance of :class:`pymbolic.primitives.Variable`,
+        :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    init_arg_names = ("function", )
+
+    def __init__(self, function):
+        if isinstance(function, str):
+            function = p.Variable(function)
+        from loopy.library.reduction import ReductionOpFunction
+        assert isinstance(function, (p.Variable, ReductionOpFunction))
+        self.function = function
+
+    @property
+    def name(self):
+        from loopy.library.reduction import ReductionOpFunction
+        if isinstance(self.function, p.Variable):
+            return self.function.name
+        elif isinstance(self.function, ReductionOpFunction):
+            return self.function
+        else:
+            raise LoopyError("Unexpected function type %s in ResolvedFunction." %
+                    type(self.function))
+
+    def __getinitargs__(self):
+        return (self.function, )
+
+    def make_stringifier(self, originating_stringifier=None):
+        return StringifyMapper()
+
+    mapper_method = intern("map_resolved_function")
+
+
+class EvaluatorWithDeficientContext(PartialEvaluationMapper):
+    """Evaluation Mapper that does not need values of all the variables
+    involved in the expression.
+
+    Returns the expression with the values mapped from :attr:`context`.
+    """
+    def map_variable(self, expr):
+        if expr.name in self.context:
+            return self.context[expr.name]
+        else:
+            return expr
+
+
+class VariableInAnExpression(CombineMapper):
+    def __init__(self, variables_to_search):
+        assert(all(isinstance(variable, p.Variable) for variable in
+            variables_to_search))
+        self.variables_to_search = variables_to_search
+
+    def combine(self, values):
+        return any(values)
+
+    def map_variable(self, expr):
+        return expr in self.variables_to_search
+
+    def map_constant(self, expr):
+        return False
+
+
+class SweptInameStrideCollector(CoefficientCollectorBase):
+    """
+    Mapper to compute the coefficients of the swept inames for
+    :class:`SubArrayRef`.
+    """
+    def map_algebraic_leaf(self, expr):
+        # subscripts that are not involved in :attr:`target_names` are treated
+        # as constants.
+        if isinstance(expr, p.Subscript) and (self.target_names is None
+                or expr.aggregate.name not in self.target_names):
+            return {1: expr}
+
+        return super().map_algebraic_leaf(expr)
+
+
+def get_start_subscript_from_sar(sar, kernel):
+    """
+    Returns an instance of :class:`pymbolic.primitives.Subscript`, the
+    beginning subscript of the array swept by the *SubArrayRef*.
+
+    **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning
+    subscript would be ``a[0, j, 0, l]``.
+    """
+
+    def _get_lower_bound(iname):
+        pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff
+        return int(pw_aff_to_expr(pwaff))
+
+    swept_inames_to_zeros = {
+            swept_iname.name: _get_lower_bound(swept_iname.name) for
+            swept_iname in sar.swept_inames}
+
+    return EvaluatorWithDeficientContext(swept_inames_to_zeros)(
+            sar.subscript)
+
+
+class SubArrayRef(LoopyExpressionBase):
+    """
+    An algebraic expression to map an affine memory layout pattern (known as
+    a sub-array) as consecutive elements of the sweeping axes which are
+    defined using :attr:`SubArrayRef.swept_inames`.
+
+    .. attribute:: swept_inames
+
+        An instance of :class:`tuple` denoting the axes to which the
+        sub-array is supposed to be mapped.
+
+    .. attribute:: subscript
+
+        An instance of :class:`pymbolic.primitives.Subscript` denoting the
+        array in the kernel.
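+
+    **Example:** in ``[i, k]: a[i, j, k, l]``, the swept inames are
+    ``(i, k)`` and the subscript is ``a[i, j, k, l]``; the sub-array is
+    swept along ``i`` and ``k`` while ``j`` and ``l`` remain fixed.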
+ """ + + init_arg_names = ("swept_inames", "subscript") + + def __init__(self, swept_inames, subscript): + + # {{{ sanity checks + + if not isinstance(swept_inames, tuple): + assert isinstance(swept_inames, p.Variable) + swept_inames = (swept_inames,) + + assert isinstance(swept_inames, tuple) + + for iname in swept_inames: + assert isinstance(iname, p.Variable) + assert isinstance(subscript, p.Subscript) + + # }}} + + self.swept_inames = swept_inames + self.subscript = subscript + + def __getinitargs__(self): + return (self.swept_inames, self.subscript) + + def get_hash(self): + return hash((self.__class__, self.swept_inames, self.subscript)) + + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.subscript == self.subscript + and other.swept_inames == self.swept_inames) + + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() + + mapper_method = intern("map_sub_array_ref") + # }}} @@ -780,9 +993,12 @@ def get_reduction_inames(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tags - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -981,12 +1197,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tags = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tags, (), expn_state) + return self.map_substitution(name, tags, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1038,6 +1256,10 @@ class RuleAwareIdentityMapper(IdentityMapper): return sym def __call__(self, expr, kernel, insn): + """ + :arg insn: A :class:`~loopy.kernel.InstructionBase` of which *expr* is + a part of, or *None* if *expr*'s source is not an instruction. 
+ """ from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1288,6 +1510,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser @@ -1318,8 +1548,10 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) import loopy as lp + if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1335,6 +1567,26 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + + elif pstate.is_next(_openbracket): + rollback_pstate = pstate.copy() + pstate.advance() + pstate.expect_not_end() + if pstate.is_next(_closebracket): + swept_inames = () + else: + swept_inames = self.parse_expression(pstate) + + pstate.expect(_closebracket) + pstate.advance() + if pstate.is_next(_colon): + # pstate.expect(_colon): + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return SubArrayRef(swept_inames, subscript) + else: + pstate = rollback_pstate + return super().parse_prefix(rollback_pstate) else: return super().parse_prefix(pstate) @@ -2032,7 +2284,7 @@ def get_access_map(domain, subscript, assumptions=None, shape=None, except ExpressionToAffineConversionError as err: shape_aff = None - if shape is not None: + if shape is not None and shape[idim] is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) except ExpressionToAffineConversionError: @@ -2166,6 +2418,10 @@ class BatchedAccessMapMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + total_inames = inames | {iname.name for iname in expr.swept_inames} + return self.rec(expr.subscript, total_inames) + class AccessRangeMapper: """**IMPORTANT** diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 8af47c41222416fbd2dbe3dc5a88d4090a4a06f0..8706c4a37728973c572cd7acc679e23da9c13932 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -87,7 +87,7 @@ class TargetBase: def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, callables_table): pass # }}} @@ -157,8 +157,15 @@ class ASTBuilderBase: # {{{ library - def function_manglers(self): - return [] + @property + def known_callables(self): + """ + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for the + function ids known to *self.target*. + """ + # FIXME: @inducer: Do we need to move this to TargetBase? 
+ return {} def symbol_manglers(self): return [] @@ -170,6 +177,10 @@ class ASTBuilderBase: # {{{ code generation guts + @property + def ast_module(self): + raise NotImplementedError() + def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): raise NotImplementedError diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 92b94d658fd24f44ff4b8b0ba748f3cd5212617a..a45965c800316c0922b3186053fd3f61c96e5e63 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -24,7 +24,6 @@ THE SOFTWARE. """ import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -32,6 +31,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from loopy.tools import remove_common_indentation @@ -72,11 +72,13 @@ class DTypeRegistryWrapper: return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" @@ -447,42 +449,60 @@ def c_symbol_mangler(kernel, name): # float NAN as defined in C99 standard if name == "NAN": return NumpyType(np.dtype(np.float32)), name + + if name in ["INT_MAX", "INT_MIN"]: + return NumpyType(np.dtype(np.int32)), name + return None # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. 
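+
+    For example, with a ``double`` argument ``sin`` keeps its name, while a
+    ``float`` argument specializes it to ``sinf`` (see ``with_types`` below).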
+ """ - # {{{ (abs|max|min) -> (fabs|fmax|fmin) + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name - if name in ["abs", "min", "max"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype.kind == "f": - name = "f" + name + # {{{ (abs|max|min) -> (fabs|fmax|fmin) - # }}} + if name in ["abs", "min", "max"]: + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_id_to_dtype.values()]) + if dtype.kind == "f": + name = "f" + name + + # }}} - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind in "fc"): + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc", "abs", "real", "imag"]: - dtype = arg_dtypes[0].numpy_dtype - real_dtype = np.empty(0, dtype=dtype).real.dtype + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") - if modify_name: + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + real_dtype = np.empty(0, dtype=dtype).real.dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + + # for CUDA, C Targets the name must be modified if real_dtype == np.float64: pass # fabs elif real_dtype == np.float32: @@ -491,29 +511,45 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): and real_dtype == np.float128): # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError(f"{name} does not support type {real_dtype}") + raise LoopyTypeError("{} does not support type {}".format(name, + dtype)) if dtype.kind == "c": name = "c" + name - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if name in ["abs", "real", "imag"]: + dtype = real_dtype + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) - # binary functions - if (name in ["fmax", "fmin", "copysign", "pow"] - and len(arg_dtypes) == 2): + # binary functions + elif name in ["fmax", "fmin", "pow", "atan2", "copysign"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - real_dtype = np.empty(0, dtype=dtype).real.dtype + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." 
% name)
 
-    if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c":
-        raise LoopyTypeError(f"{name} does not support complex numbers")
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
 
-    elif real_dtype.kind in "fc":
-        if modify_name:
+            dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                     if id >= 0])
+            real_dtype = np.empty(0, dtype=dtype).real.dtype
+
+            if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c":
+                raise LoopyTypeError(f"{name} does not support complex numbers")
+
+            elif real_dtype.kind in "fc":
                if real_dtype == np.float64:
                    pass  # fmin
                elif real_dtype == np.float32:
@@ -523,50 +559,48 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
                    name = name + "l"  # fminl
                else:
                    raise LoopyTypeError("%s does not support type %s"
-                        % (name, real_dtype))
-
-        if dtype.kind == "c":
-            name = "c" + name  # cpow
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=2*(result_dtype,))
-
-    # complex functions
-    if (name in ["abs", "real", "imag"]
-            and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "c"):
-        dtype = arg_dtypes[0].numpy_dtype
-        real_dtype = np.empty(0, dtype=dtype).real.dtype
-
-        if modify_name:
-            if real_dtype == np.float64:
-                pass  # fabs
-            elif real_dtype == np.float32:
-                name = name + "f"  # fabsf
-            elif (hasattr(np, "float128")
-                    and real_dtype == np.float128):  # pylint:disable=no-member
-                name = name + "l"  # fabsl
-            else:
-                raise LoopyTypeError(f"{name} does not support type {real_dtype}")
-
-        name = "c" + name
-
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(real_dtype),),
-                arg_dtypes=arg_dtypes)
-
-    if (name == "isnan" and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "f"):
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(np.int32),),
-                arg_dtypes=arg_dtypes)
+                            % (name, dtype))
+            if dtype.kind == "c":
+                name = "c" + name  # cpow
+            dtype = NumpyType(dtype)
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
+                    callables_table)
+
+        elif name == "isnan":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 0:
+                    raise LoopyError(f"'{name}' can take only one argument.")
+
+            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
+
+            dtype = arg_id_to_dtype[0].numpy_dtype
+            return (
+                    self.copy(
+                        name_in_target=name,
+                        arg_id_to_dtype={
+                            0: NumpyType(dtype),
+                            -1: NumpyType(np.int32)}),
+                    callables_table)
+
+
+def get_c_callables():
+    """
+    Returns a mapping from the ids of the math functions known in C to the
+    corresponding :class:`InKernelCallable` instances.
+ """ + cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc", "isnan", "real", "imag"] - return None + return {id_: CMathCallable(id_) for id_ in cmath_ids} # }}} @@ -574,12 +608,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super().symbol_manglers() + [ @@ -592,6 +620,12 @@ class CFamilyASTBuilder(ASTBuilderBase): _preamble_generator, ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_c_callables()) + return callables + # }}} # {{{ code generation @@ -678,9 +712,13 @@ class CFamilyASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" + if codegen_state.is_entrypoint: + name = Value("void", name) + else: + name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( - Value("void", name), + name, [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) @@ -709,8 +747,8 @@ class CFamilyASTBuilder(ASTBuilderBase): temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( - temporaries_read_in_subkernel(kernel, subkernel) | - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( kernel.temporary_variables.values(), @@ -831,6 +869,11 @@ class CFamilyASTBuilder(ASTBuilderBase): # {{{ code generation guts + @property + def ast_module(self): + import cgen + return cgen + def get_expression_to_code_mapper(self, codegen_state): return self.get_expression_to_c_expression_mapper(codegen_state) @@ -993,83 +1036,33 @@ class CFamilyASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.callables_table[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
+ if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes, - mangle_result.result_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + # takes "is_returned" to infer whether insn.assignees[0] is a part of + # LHS. + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast_lazy( - lambda: mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): @@ -1207,7 +1200,6 @@ class ExecutableCTarget(CTarget): """ An executable CFamilyTarget that uses (by default) JIT compilation of C-code """ - def __init__(self, compiler=None, fortran_abi=False): super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler @@ -1215,7 +1207,8 @@ class ExecutableCTarget(CTarget): def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, compiler=self.compiler) + return CKernelExecutor(knl, entrypoint=kwargs.pop("entrypoint"), + compiler=self.compiler) def get_host_ast_builder(self): # enable host code generation diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index d73912460a2c99075c875375056be5922a98d692..1150a9f9b930148d28a7a5b658d3e2a17404b5b1 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -163,7 +163,8 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join(f'"{arg.name}": {arg.name}' for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + kernel.get_written_variables())) else: out_args = [arg for 
arg in implemented_data_info @@ -404,7 +405,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, entrypoint, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -413,54 +414,57 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super().__init__(kernel) + super().__init__(program, entrypoint) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, kernel, entrypoint, codegen_result): generator = CExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(kernel, entrypoint, codegen_result) def get_wrapper_generator(self): return CExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = "\n".join([dev_code, "", host_code]) - if self.kernel.options.write_cl: + if self.program[entrypoint].options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program[entrypoint].options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] + for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, - self.compiler)) + codegen_result.implemented_data_infos[entrypoint], all_code, + self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) # }}} @@ -477,7 +481,9 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 853722c3c4533865c7c313d09f16105b9c9c46cc..336028925f8aaf2eea7865d080ac88e1c398b033 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -35,9 +35,9 @@ from 
pymbolic import var from loopy.expression import dtype_to_type_context -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType from loopy.target.c import CExpression @@ -62,7 +62,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -176,6 +177,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + from loopy.symbolic import get_start_subscript_from_sar + return var("&")(self.rec(get_start_subscript_from_sar(expr, self.kernel), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] @@ -439,104 +445,12 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function - - # {{{ implement indexof, indexof_vec - - if identifier.name in ["indexof", "indexof_vec"]: - if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) - arg, = expr.parameters - if not isinstance(arg, Subscript): - raise LoopyError( - "argument to %s must be a subscript" % identifier.name) - - ary = self.find_array(arg) - - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) - - from loopy.kernel.data import ImageArg - if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) - - if identifier.name == "indexof": - return access_info.subscripts[0] - elif identifier.name == "indexof_vec": - from loopy.kernel.array import VectorArrayDimTag - ivec = None - for iaxis, dim_tag in enumerate(ary.dim_tags): - if isinstance(dim_tag, VectorArrayDimTag): - ivec = iaxis - - if ivec is None: - return access_info.subscripts[0] - else: - return ( - access_info.subscripts[0]*ary.shape[ivec] - + access_info.vector_index) - - else: - raise RuntimeError("should not get here") - - # }}} - - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + return ( + self.codegen_state.callables_table[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables @@ -563,6 +477,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_power(self, expr, type_context): tgt_dtype = self.infer_type(expr) + base_dtype = self.infer_type(expr.base) exponent_dtype = self.infer_type(expr.exponent) from pymbolic.primitives import is_constant, is_zero @@ -584,10 +499,21 @@ class ExpressionToCExpressionMapper(IdentityMapper): "int_pow", func_name, (tgt_dtype, exponent_dtype), (tgt_dtype, ))) + # FIXME: This need some more callables to be registered. 
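+            # A sketch of the lookup/specialize/emit pattern that the else
+            # branch below uses (interface names per this patch; the dtypes
+            # are illustrative):
+            #
+            #     clbl = self.codegen_state.ast_builder.known_callables["pow"]
+            #     clbl, _ = clbl.with_types({0: tgt_dtype, 1: exponent_dtype},
+            #             self.codegen_state.callables_table)
+            #     # e.g. clbl.name_in_target == "powf" for float32 operands
+            #     result = var(clbl.name_in_target)(base_code, exponent_code)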
return var(func_name)(self.rec(expr.base, type_context), self.rec(expr.exponent, type_context)) else: - return self.rec(var("pow")(expr.base, expr.exponent), type_context) + from loopy.codegen import SeenFunction + clbl = self.codegen_state.ast_builder.known_callables["pow"] + clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, + self.codegen_state.callables_table)[0] + self.codegen_state.seen_functions.add( + SeenFunction( + clbl.name, clbl.name_in_target, + (base_dtype, exponent_dtype), + (tgt_dtype,))) + return var(clbl.name_in_target)(self.rec(expr.base, type_context), + self.rec(expr.exponent, type_context)) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 67dc1fe249af91d9b73a7162867dcd98c7ef6bc7..63018189e7aaa729f6a008b4768d479f78e3cfeb 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -29,10 +29,11 @@ from pytools import memoize_method from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -110,43 +111,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, callables_table): - return dtype, name + name = self.name - if name in ["pow"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] - if dtype == np.float64: - pass # pow - elif dtype == np.float32: - name = name + "f" # powf - else: - raise RuntimeError(f"{name} does not support type {dtype}") + # {{{ sanity checks + + for id, dtype in arg_id_to_dtype.items(): + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + if dtype is not None and dtype.kind == "c": + raise LoopyTypeError( + f"'{name}' does not support complex arguments.") + + # }}} + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + updated_arg_id_to_dtype = {id: NumpyType(dtype) + for id in range(-1, num_args)} + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) - return dtype, name + if name == "dot": + # CUDA dot function: + # Performs dot product. Input types: vector and return type: scalar. 
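+            # A sketch (illustrative, not prescriptive) of how one of the
+            # _CUDA_SPECIFIC_FUNCTIONS above specializes, assuming a float64
+            # argument:
+            #
+            #     clbl = CudaCallable(name="rsqrt")
+            #     clbl, _ = clbl.cuda_with_types({0: NumpyType(np.float64)},
+            #             callables_table)
+            #     # clbl.name_in_target == "rsqrt"
+            #     # clbl.arg_id_to_dtype == {-1: float64, 0: float64}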
+ for i in range(2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + input_dtype = arg_id_to_dtype[0] - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + scalar_dtype, offset, field_name = input_dtype.fields["x"] + return_dtype = scalar_dtype + return self.copy(arg_id_to_dtype={0: input_dtype, 1: input_dtype, + -1: return_dtype}) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - return None + +def get_cuda_callables(): + cuda_func_ids = {"dot"} | set(_CUDA_SPECIFIC_FUNCTIONS) + return {id_: CudaCallable(name=id_) for id_ in cuda_func_ids} # }}} @@ -192,6 +232,9 @@ class CudaTarget(CFamilyTarget): super().__init__() + def split_kernel_at_global_barriers(self): + return True + def get_device_ast_builder(self): return CUDACASTBuilder(self) @@ -225,16 +268,51 @@ class CudaTarget(CFamilyTarget): # }}} +# {{{ preamable generator + +def cuda_preamble_generator(preamble_info): + from loopy.types import AtomicNumpyType + seen_64_bit_atomics = any( + isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8 + for dtype in preamble_info.seen_atomic_dtypes) + + if seen_64_bit_atomics: + # Source: + # docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + yield ("00_enable_64bit_atomics", """ + #if __CUDA_ARCH__ < 600 + __device__ double atomicAdd(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); + } + #endif + """) + +# }}} + + # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - cuda_function_mangler - ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_cuda_callables()) + return callables # }}} @@ -260,7 +338,8 @@ class CUDACASTBuilder(CFamilyASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -273,6 +352,12 @@ class CUDACASTBuilder(CFamilyASTBuilder): return FunctionDeclarationWrapper(fdecl) + def preamble_generators(self): + + return ( + super().preamble_generators() + [ + cuda_preamble_generator]) + # }}} # {{{ code generation guts @@ -350,6 +435,97 @@ class CUDACASTBuilder(CFamilyASTBuilder): return CudaConstant(arg_decl) + # {{{ code generation for atomic update + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum + from cgen import Statement + from pymbolic.mapper.stringifier import PREC_NONE + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ + np.int32, np.int64, np.float32, np.float64]: + # atomicAdd + if isinstance(rhs_expr, Sum): + ecm = 
self.get_expression_to_code_mapper(codegen_state) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Statement("atomicAdd(&{}, {})".format( + lhs_expr_code, rhs_expr_code)) + else: + from cgen import Block, DoWhile, Assign + from loopy.target.c import POD + old_val_var = codegen_state.var_name_generator("loopy_old_val") + new_val_var = codegen_state.var_name_generator("loopy_new_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + }) + + lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) + + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + from loopy.symbolic import SubstitutionMapper + + subst = SubstitutionMapper( + make_subst_func({lhs_expr: var(old_val_var)})) + rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + cast_str = "" + old_val = old_val_var + new_val = new_val_var + + if lhs_dtype.numpy_dtype.kind == "f": + if lhs_dtype.numpy_dtype == np.float32: + ctype = "int" + elif lhs_dtype.numpy_dtype == np.float64: + ctype = "long" + else: + assert False + + old_val = "*(%s *) &" % ctype + old_val + new_val = "*(%s *) &" % ctype + new_val + cast_str = "(%s *) " % (ctype) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), + DoWhile( + "atomicCAS(" + "%(cast_str)s&(%(lhs_expr)s), " + "%(old_val)s, " + "%(new_val)s" + ") != %(old_val)s" + % { + "cast_str": cast_str, + "lhs_expr": lhs_expr_code, + "old_val": old_val, + "new_val": new_val, + }, + Block([ + Assign(old_val_var, lhs_expr_code), + Assign(new_val_var, rhs_expr_code), + ]) + ) + ]) + else: + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + + # }}} + # }}} # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index d594f3e10a9fef7e3e16a2848d68eca7f29ebfc9..a8666f02bb152e008efc817da9461edce6a34d33 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -58,12 +58,13 @@ class SeparateArrayPackingController: It also repacks outgoing arrays of this type back into an object array. 
""" - def __init__(self, kernel): + def __init__(self, program, entrypoint): + # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program[entrypoint].args: if not isinstance(arg, ArrayBase): continue @@ -79,7 +80,8 @@ class SeparateArrayPackingController: name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program[entrypoint].get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -140,7 +142,7 @@ class ExecutionWrapperGeneratorBase: # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -165,7 +167,8 @@ class ExecutionWrapperGeneratorBase: if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -212,7 +215,7 @@ class ExecutionWrapperGeneratorBase: # {{{ integer arg finding from offsets def generate_integer_arg_finding_from_offsets(self, gen, kernel, - implemented_data_info): + implemented_data_info): options = kernel.options gen("# {{{ find integer arguments from offsets") @@ -616,7 +619,7 @@ class ExecutionWrapperGeneratorBase: def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -628,12 +631,14 @@ class ExecutionWrapperGeneratorBase: kernel """ - options = kernel.options - implemented_data_info = codegen_result.implemented_data_info + options = program[entrypoint].options + #FIXME: endswith is ugly maybe make + # codegen_result.implemented_data_infos a dict? + implemented_data_info = codegen_result.implemented_data_infos[entrypoint] from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % entrypoint, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -648,21 +653,24 @@ class ExecutionWrapperGeneratorBase: self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) - + gen, program[entrypoint], implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program[entrypoint], implemented_data_info, options) + + #FIXME: should we make this as a dict as well. 
+ host_program_name = codegen_result.host_programs[entrypoint].name - self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + self.generate_invocation(gen, host_program_name, args, + program[entrypoint], implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program[entrypoint], + implemented_data_info) if options.write_wrapper: output = gen.get() @@ -710,64 +718,66 @@ class KernelExecutorBase: .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program, entrypoint): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program + self.entrypoint = entrypoint - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program, + entrypoint) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program[entrypoint].args + if arg.is_output) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program[entrypoint].args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes + from loopy.kernel import KernelState + from loopy.program import resolve_callables - kernel = self.kernel + program = resolve_callables(self.program) if arg_to_dtype_set: var_to_dtype = {} + entry_knl = program[entrypoint] for var, dtype in arg_to_dtype_set: - try: - dest_name = kernel.impl_arg_to_arg[var].name - except KeyError: + if var in entry_knl.impl_arg_to_arg: + dest_name = entry_knl.impl_arg_to_arg[var].name + else: dest_name = var - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) + var_to_dtype[dest_name] = dtype - kernel = add_dtypes(kernel, var_to_dtype) + program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.state < KernelState.SCHEDULED: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + for e in program.entrypoints: + program = program.with_kernel( + get_one_scheduled_kernel(program[e], program.callables_table)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, entrypoint, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. 
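+        # The uncached path above boils down to this per-entrypoint pipeline
+        # (sketch, eliding the dtype plumbing):
+        #
+        #     program = resolve_callables(self.program)
+        #     program = infer_unknown_types(program, expect_completion=True)
+        #     program = preprocess_program(program)
+        #     for e in program.entrypoints:
+        #         program = program.with_kernel(get_one_scheduled_kernel(
+        #                 program[e], program.callables_table))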
- cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -775,9 +785,11 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % + self.program.entrypoints) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(entrypoint, + arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -785,10 +797,13 @@ class KernelExecutorBase: return kernel def arg_to_dtype_set(self, kwargs): + kwargs = kwargs.copy() if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + entrypoint = kwargs.pop("entrypoint") + + impl_arg_to_arg = self.program[entrypoint].impl_arg_to_arg arg_to_dtype = {} for arg_name, val in kwargs.items(): arg = impl_arg_to_arg.get(arg_name, None) @@ -809,18 +824,18 @@ class KernelExecutorBase: # {{{ debugging aids - def get_highlighted_code(self, arg_to_dtype=None, code=None): + def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None): if code is None: - code = self.get_code(arg_to_dtype) + code = self.get_code(entrypoint, arg_to_dtype) return get_highlighted_code(code) - def get_code(self, arg_to_dtype=None): + def get_code(self, entrypoint, arg_to_dtype=None): def process_dtype(dtype): if isinstance(dtype, type) and issubclass(dtype, np.generic): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -828,22 +843,19 @@ class KernelExecutorBase: arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in arg_to_dtype.items()) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) return code.device_code() - def get_invoker_uncached(self, kernel, *args): - raise NotImplementedError() - - def get_wrapper_generator(self): + def get_invoker_uncached(self, program, entrypoint, *args): raise NotImplementedError() - def get_invoker(self, kernel, *args): + def get_invoker(self, program, entrypoint, *args): from loopy import CACHING_ENABLED - cache_key = (self.__class__.__name__, kernel) + cache_key = (self.__class__.__name__, (program, entrypoint)) if CACHING_ENABLED: try: @@ -851,9 +863,9 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: invoker cache miss" % kernel.name) + logger.debug("%s: invoker cache miss" % entrypoint) - invoker = self.get_invoker_uncached(kernel, *args) + invoker = self.get_invoker_uncached(program, entrypoint, *args) if CACHING_ENABLED: invoker_cache.store_if_not_present(cache_key, invoker) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index d047b6464c1fc86010d1943527af68be73278bbb..526d3855e023df4357c68ca9b59975cbb3fd908f 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -171,8 +171,9 @@ class ISPCTarget(CFamilyTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def 
pre_codegen_check(self, kernel, callables_table): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 096b4de940dd487e8333f397197fe451239caf90..fa356068eeb617c606acdf4eef2dd9c762ee35ed 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -30,11 +30,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -180,77 +180,225 @@ VECTOR_LITERAL_FUNCS = { } -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "pow" and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype == np.float64: - name = "powf64" - elif dtype == np.float32: - name = "powf32" - else: - raise LoopyTypeError(f"'pow' does not support type {dtype}.") - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) - - return None + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name + + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise 
LoopyError(f"'{name}' can take only one argument.") + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + # binary functions + elif name in ["fmax", "fmin", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + #FIXME: Do we need to raise here?: + # The pattern we generally follow is that if we don't find + # a function, then we just return None + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + elif name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + common_dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if common_dtype.kind in ["u", "i", "f"]: + if common_dtype.kind == "f": + name = "f"+name + + dtype = NumpyType(common_dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + else: + # Unsupported type. 
+                raise LoopyError("%s function not supported for the types %s" %
+                        (name, common_dtype))
+
+        elif name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError(f"'{name}' can take only 2 arguments.")
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            return (
+                    self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                        NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
+                    callables_table)
+
+        elif name == "pow":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError(f"'{name}' can take only 2 arguments.")
+
+            common_dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                         if (id >= 0 and dtype is not None)])
+
+            if common_dtype == np.float64:
+                name = "powf64"
+            elif common_dtype == np.float32:
+                name = "powf32"
+            else:
+                raise LoopyTypeError(
+                        f"'pow' does not support type {common_dtype}.")
+
+            result_dtype = NumpyType(common_dtype)
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: result_dtype,
+                            0: result_dtype, 1: result_dtype}),
+                    callables_table)
+
+        elif name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+            num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            callables_table)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                         arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1,
+                num_args)}
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    callables_table)
+
+        elif name in VECTOR_LITERAL_FUNCS:
+            base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id < count:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            count))
+
+            for i in range(count):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            callables_table)
+
+            updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in
+                range(count)}
+            updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
+                    NumpyType(dtype), count)
+
+            return (
+                    self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    callables_table)
+
+        # does not satisfy any of the conditions needed for specialization.
+        # hence just returning a copy of the callable.
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                callables_table)
+
+
+def get_opencl_callables():
+    """
+    Returns a mapping from OpenCL-specific function identifiers to matching
+    :class:`OpenCLCallable` instances.
+ """ + opencl_function_ids = ( + {"max", "min", "dot", "pow", "abs", "acos", "asin", + "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", + "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc"} + | set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) + | set(VECTOR_LITERAL_FUNCS)) + + return {id_: OpenCLCallable(name=id_) for id_ in + opencl_function_ids} # }}} @@ -274,6 +422,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None @@ -310,6 +460,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -417,13 +568,11 @@ class OpenCLTarget(CFamilyTarget): class OpenCLCASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super().function_manglers()) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_opencl_callables()) + return callables def symbol_manglers(self): return ( @@ -432,13 +581,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super().preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -451,6 +597,11 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.is_entrypoint: + # auxiliary kernels need not mention opencl speicific qualifiers + # for a functions signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize @@ -459,7 +610,8 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bb04ddc3d8f4b01b859242766bc0230c68dccb23..70663b2daffc43da19d69473ece5369244981fd6 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -25,13 +25,13 @@ THE SOFTWARE. 
import numpy as np import pymbolic.primitives as p -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -130,7 +130,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -147,7 +147,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -195,36 +196,86 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, callables_table): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + callables_table) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) + # function calls for floating parameters. 
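+                # For complex inputs the branch above rewrites to the
+                # pyopencl complex helpers; e.g. (sketch):
+                #
+                #     clbl = PyOpenCLCallable(name="sin")
+                #     clbl, _ = clbl.with_types({0: NumpyType(np.complex128)},
+                #             callables_table)
+                #     # clbl.name_in_target == "cdouble_sin"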
+ numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ("u", "i"): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == "abs": + name = "fabs" + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) +def get_pyopencl_callables(): + pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"] + return {id_: PyOpenCLCallable(name=id_) for id_ in pyopencl_ids} - return None +# }}} # {{{ preamble generator @@ -569,8 +620,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -638,9 +689,10 @@ class PyOpenCLTarget(OpenCLTarget): def get_kernel_executor_cache_key(self, queue, **kwargs): return queue.context - def get_kernel_executor(self, kernel, queue, **kwargs): + def get_kernel_executor(self, program, queue, **kwargs): from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor - return PyOpenCLKernelExecutor(queue.context, kernel) + return PyOpenCLKernelExecutor(queue.context, program, + entrypoint=kwargs.pop("entrypoint")) def with_device(self, device): return type(self)(device) @@ -973,21 +1025,20 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler - return ( - [ - pyopencl_function_mangler, - random123_function_mangler - # order matters: e.g. prefer our abs() over that of the - # superclass - ] + super().function_manglers()) + @property + def known_callables(self): + from loopy.library.random123 import get_random123_callables + + # order matters: e.g. prefer our abs() over that of the + # superclass + callables = super().known_callables + callables.update(get_pyopencl_callables()) + callables.update(get_random123_callables(self.target)) + return callables def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super().preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index cdee5600bb5dd0dce3a3971583604f737c6913d9..5ac37e1520abc774b877d16741302ff7b79810af 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -257,7 +257,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program, entrypoint): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -266,62 +266,69 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super().__init__(kernel) + super().__init__(program, entrypoint) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=( - kernel.target.with_device(context.devices[0]))) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=( + program.target.with_device(context.devices[0]))) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, program, entrypoint, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(program, entrypoint, codegen_result) def get_wrapper_generator(self): return PyOpenCLExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, + arg_to_dtype_set) + # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if program[entrypoint].options.write_cl: + #FIXME: redirect to "translation unit" level option as well. output = dev_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if program[entrypoint].options.edit_cl: + #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") import pyopencl as cl + #FIXME: redirect to "translation unit" level option as well. 
cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + for dp in cl_program.kernel_names.split(";"): + setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -356,10 +363,12 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index a1557e47bdf8990e7aa89472b59f3c9fc3666a05..c7f20ff559cad5d23efed9f091a1a5407337277e 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -27,11 +27,11 @@ import numpy as np from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite +from genpy import Suite, Collection # {{{ expression to code @@ -42,7 +42,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -80,48 +81,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.callables_table[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + clbl = self.codegen_state.callables_table[ + expr.function.name] str_parameters = None + number_of_assignees = len([key for key in + clbl.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] from loopy.codegen import SeenFunction self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) + SeenFunction(clbl.name, + clbl.name_in_target, + clbl.input_dtypes, + clbl.result_dtypes)) - return "{}({})".format(mangle_result.target_name, ", ".join(str_parameters)) + return "{}({})".format(clbl.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -147,16 +137,6 @@ class ExpressionToPythonMapper(StringifyMapper): # }}} -# {{{ genpy extensions - -class Collection(Suite): - def generate(self): - for item in self.contents: - yield from item.generate() - -# }}} - - # {{{ ast builder def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): @@ -185,13 +165,12 @@ class PythonASTBuilderBase(ASTBuilderBase): """A Python host AST builder for integration with PyOpenCL. """ - # {{{ code generation guts - - def function_manglers(self): - return ( - super().function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + @property + def known_callables(self): + from loopy.target.c import get_c_callables + callables = super().known_callables + callables.update(get_c_callables()) + return callables def preamble_generators(self): return ( @@ -199,6 +178,13 @@ class PythonASTBuilderBase(ASTBuilderBase): _base_python_preamble_generator ]) + # {{{ code generation guts + + @property + def ast_module(self): + import genpy + return genpy + def get_function_declaration(self, codegen_state, codegen_result, schedule_index): return None diff --git a/loopy/tools.py b/loopy/tools.py index 5be4ca6b58f0a2e0dd5907eacf4749dd3aaf927b..644082ed61143798f3c01e5af820092aabd665af 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -35,6 +35,17 @@ def is_integer(obj): return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
+ """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -52,6 +63,13 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) self.rec(expr.expr, *args) + def map_foreign(self, expr, *args, **kwargs): + """Mapper method dispatch for non-:mod:`pymbolic` objects.""" + if expr is None: + self.key_hash.update(b"") + else: + PersistentHashWalkMapperBase.map_foreign(self, expr, *args, **kwargs) + class LoopyKeyBuilder(KeyBuilderBase): """A custom :class:`pytools.persistent_dict.KeyBuilder` subclass @@ -72,6 +90,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) @@ -99,6 +122,8 @@ class LoopyKeyBuilder(KeyBuilderBase): else: PersistentHashWalkMapper(key_hash)(key) + update_for_PMap = update_for_dict # noqa: N815 + class PymbolicExpressionHashWrapper: def __init__(self, expression): diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index bc324d7fa96ad409c27e048a3258f6eca2d0f1b9..1e03ade94710b25cd56eecc7079afdadf567a82c 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -24,6 +24,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. currentmodule:: loopy @@ -34,6 +36,7 @@ __doc__ = """ # {{{ add_barrier +@iterate_over_kernels_if_given_program def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel @@ -53,6 +56,8 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to *synchronization_kind* """ + assert isinstance(kernel, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 0a38790152f9e1325733a8bdc47d13f05d400c39..8376688198c3cff232d9f9006883d1b236efe367 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -23,9 +23,13 @@ THE SOFTWARE. 
from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -49,7 +53,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index e27e5902644205e8a1643b4c243ba8ae6532fafa..5da142e3d400edf151ee755990d1fa4845aa147e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. currentmodule:: loopy @@ -98,6 +101,7 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) +@iterate_over_kernels_if_given_program def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 84503a618b067a147dc5181c2251d17d8b83eb44..e8c4bc2e9bd687d782a4d9f71dbc5e3a54eb639b 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -30,6 +30,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var @@ -127,10 +130,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -166,6 +169,20 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
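+
+    A rough usage sketch (the array name, iname, and expressions here are
+    illustrative)::
+
+        knl = lp.buffer_array(knl, "a", buffer_inames=["i_inner"],
+                init_expression="0", store_expression="base + buffer")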
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(buffer_array(kernel[kernel_names[0]], + var_name, buffer_inames, init_expression, store_expression, within, + default_tag, temporary_scope, temporary_is_local, + fetch_bounding_box, kernel.callables_table)) + + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -237,7 +254,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -525,7 +543,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -534,4 +552,25 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + clbl = clbl.copy( + subkernel=buffer_array_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 0000000000000000000000000000000000000000..a5c4c52840541efcaf6971cfbe98e37d6367e772 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,741 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import islpy as isl + +from pytools import UniqueNameGenerator + +from loopy.kernel import LoopKernel +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + Assignment, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext, CombineMapper, IdentityMapper) +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import Program + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable + +.. autofunction:: merge +""" + + +def register_callable(translation_unit, function_identifier, callable_, + redefining_not_ok=True): + """ + :param translation_unit: A :class:`loopy.Program`. + :param callable_: A :class:`loopy.InKernelCallable`. + """ + + if isinstance(callable_, LoopKernel): + callable_ = CallableKernel(callable_) + + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(callable_, InKernelCallable) + + if (function_identifier in translation_unit.callables_table) and ( + translation_unit.callables_table[function_identifier] != callable_ + and redefining_not_ok): + raise LoopyError("Redefining function identifier not allowed. Set the" + " option 'redefining_not_ok=False' to bypass this error.") + + new_callables = translation_unit.callables_table.set(function_identifier, + callable_) + + return translation_unit.copy( + callables_table=new_callables) + + +def merge(translation_units): + """ + :param translation_units: A list of :class:`loopy.Program`. + + :returns: An instance of :class:`loopy.Program` which contains all the + callables from each of the *translation_units. 
+ """ + + for i in range(1, len(translation_units)): + if translation_units[i].target != translation_units[i-1].target: + raise LoopyError("translation units to be merged should have the" + " same target.") + + # {{{ check for callable collision + + for i, prg_i in enumerate(translation_units): + for prg_j in translation_units[i+1:]: + for clbl_name in (set(prg_i.callables_table) + & set(prg_j.callables_table)): + if (prg_i.callables_table[clbl_name] + != prg_j.callables_table[clbl_name]): + # FIXME: generate unique names + rename for the colliding + # callables + raise NotImplementedError("Translation units to be merged" + " must have different callable names" + " for now.") + + # }}} + + callables_table = {} + for trans_unit in translation_units: + callables_table.update(trans_unit.callables_table.copy()) + + return Program( + entrypoints=frozenset().union(*( + t.entrypoints or frozenset() for t in translation_units)), + callables_table=callables_table, + target=translation_units[0].target) + + +# {{{ kernel inliner mapper + +class KernelInliner(RuleAwareSubstitutionMapper): + def __init__(self, rule_mapping_context, subst_func, caller_knl, + callee_knl, callee_arg_to_call_param): + super().__init__(rule_mapping_context, subst_func, lambda *args: True) + self.caller_knl = caller_knl + self.callee_knl = callee_knl + self.callee_arg_to_call_param = callee_arg_to_call_param + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name in self.callee_knl.arg_dict: + from loopy.symbolic import get_start_subscript_from_sar + from loopy.isl_helpers import simplify_via_aff + from pymbolic.primitives import Subscript, Variable + + sar = self.callee_arg_to_call_param[expr.aggregate.name] # SubArrayRef + + callee_arg = self.callee_knl.arg_dict[expr.aggregate.name] + if sar.subscript.aggregate.name in self.caller_knl.arg_dict: + caller_arg = self.caller_knl.arg_dict[sar.subscript.aggregate.name] + else: + caller_arg = self.caller_knl.temporary_variables[ + sar.subscript.aggregate.name] + + # map inner inames to outer inames. 
+            outer_indices = self.map_tuple(expr.index_tuple, expn_state)
+
+            flatten_index = 0
+            for i, idx in enumerate(get_start_subscript_from_sar(sar,
+                    self.caller_knl).index_tuple):
+                flatten_index += idx*caller_arg.dim_tags[i].stride
+
+            flatten_index += sum(
+                    idx * tag.stride
+                    for idx, tag in zip(outer_indices, callee_arg.dim_tags))
+
+            flatten_index = simplify_via_aff(flatten_index)
+
+            new_indices = []
+            for dim_tag in caller_arg.dim_tags:
+                ind = flatten_index // dim_tag.stride
+                flatten_index -= (dim_tag.stride * ind)
+                new_indices.append(ind)
+
+            new_indices = tuple(simplify_via_aff(i) for i in new_indices)
+
+            return Subscript(Variable(sar.subscript.aggregate.name), new_indices)
+        else:
+            assert expr.aggregate.name in self.callee_knl.temporary_variables
+            return super().map_subscript(expr, expn_state)
+
+    def map_variable(self, expr, expn_state):
+        from loopy.kernel.data import ArrayArg, ValueArg
+        from loopy.symbolic import SubArrayRef
+        if expr.name in self.callee_knl.arg_dict:
+            arg = self.callee_knl.arg_dict[expr.name]
+            par = self.callee_arg_to_call_param[expr.name]
+            if isinstance(arg, ArrayArg):
+                assert arg.shape == ()
+                assert isinstance(par, SubArrayRef) and par.swept_inames == ()
+                return par.subscript.aggregate
+            else:
+                assert isinstance(arg, ValueArg)
+                return par
+        else:
+            return super().map_variable(expr, expn_state)
+
+# }}}
+
+
+# {{{ inlining of a single call instruction
+
+def substitute_into_domain(domain, param_name, expr, allowed_param_dims):
+    """
+    :arg allowed_param_dims: A :class:`list` of :class:`str` naming the
+        variables that may appear as parameter dimensions of the augmented
+        domain.
+    """
+    import pymbolic.primitives as prim
+    from loopy.symbolic import get_dependencies, isl_set_from_expr
+    if param_name not in domain.get_var_dict():
+        # param_name not in domain => domain will be unchanged
+        return domain
+
+    # {{{ rename 'param_name' to avoid namespace pollution with allowed_param_dims
+
+    dt, pos = domain.get_var_dict()[param_name]
+    domain = domain.set_dim_name(dt, pos, UniqueNameGenerator(
+        set(allowed_param_dims))(param_name))
+
+    # }}}
+
+    for dep in get_dependencies(expr):
+        if dep in allowed_param_dims:
+            domain = domain.add_dims(isl.dim_type.param, 1)
+            domain = domain.set_dim_name(
+                    isl.dim_type.param,
+                    domain.dim(isl.dim_type.param)-1,
+                    dep)
+        else:
+            raise ValueError("Augmenting caller's domain "
+                    f"with '{dep}' is not allowed.")
+
+    set_ = isl_set_from_expr(domain.space,
+            prim.Comparison(prim.Variable(param_name),
+                "==",
+                expr))
+
+    bset, = set_.get_basic_sets()
+    domain = domain & bset
+
+    return domain.project_out(dt, pos, 1)
+
+
+def rename_iname(domain, old_iname, new_iname):
+    if old_iname not in domain.get_var_dict():
+        return domain
+
+    dt, pos = domain.get_var_dict()[old_iname]
+    return domain.set_dim_name(dt, pos, new_iname)
+
+
+def get_valid_domain_param_names(knl):
+    from loopy.kernel.data import ValueArg
+    return ([arg.name for arg in knl.args if isinstance(arg, ValueArg)]
+            + [tv.name
+                for tv in knl.temporary_variables.values()
+                if tv.shape == ()]
+            + list(knl.all_inames())
+            )
+
+
+def _inline_call_instruction(caller_knl, callee_knl, call_insn):
+    """
+    Returns a copy of *caller_knl* with *call_insn* replaced by the inlined
+    body of *callee_knl*.
+ """ + import pymbolic.primitives as prim + from pymbolic.mapper.substitutor import make_subst_func + from loopy.kernel.data import ValueArg + + # {{{ sanity checks + + assert call_insn.expression.function.name == callee_knl.name + + # }}} + + callee_label = callee_knl.name[:4] + "_" + vng = caller_knl.get_var_name_generator() + ing = caller_knl.get_instruction_id_generator() + + # {{{ construct callee->caller name mappings + + # name_map: Mapping[str, str] + # A mapping from variable names in the callee kernel's namespace to + # the ones they would be referred by in the caller's namespace post inlining. + name_map = {} + + # only consider temporary variables and inames, arguments would be mapping + # according to the invocation in call_insn. + for name in (callee_knl.all_inames() + | set(callee_knl.temporary_variables.keys())): + new_name = vng(callee_label+name) + name_map[name] = new_name + + # }}} + + # {{{ iname_to_tags + + # new_iname_to_tags: caller's iname_to_tags post inlining + new_iname_to_tags = caller_knl.iname_to_tags + + for old_name, tags in callee_knl.iname_to_tags.items(): + new_iname_to_tags[name_map[old_name]] = tags + + # }}} + + # {{{ register callee's temps as caller's + + # new_temps: caller's temps post inlining + new_temps = caller_knl.temporary_variables.copy() + + for name, tv in callee_knl.temporary_variables.items(): + new_temps[name_map[name]] = tv.copy(name=name_map[name]) + + # }}} + + # {{{ get callee args -> parameters passed to the call + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = call_insn.assignees # writes + parameters = call_insn.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + from loopy.kernel.function_interface import get_kw_pos_association + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) + if isinstance(call_insn.expression, CallWithKwargs): + kw_parameters = call_insn.expression.kw_parameters + else: + kw_parameters = {} + + for kw, par in kw_parameters.items(): + arg_map[kw] = par + + for i, par in enumerate(parameters): + arg_map[pos_to_kw[i]] = par + + for i, assignee in enumerate(assignees): + arg_map[pos_to_kw[-i-1]] = assignee + + # }}} + + # {{{ domains/assumptions + + new_domains = callee_knl.domains.copy() + for old_iname in callee_knl.all_inames(): + new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) + for dom in new_domains] + + new_assumptions = callee_knl.assumptions + + for callee_arg_name, param_expr in arg_map.items(): + if isinstance(callee_knl.arg_dict[callee_arg_name], + ValueArg): + new_domains = [ + substitute_into_domain( + dom, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + for dom in new_domains] + + new_assumptions = substitute_into_domain( + new_assumptions, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + + # }}} + + # {{{ map callee's expressions to get expressions after inlining + + rule_mapping_context = SubstitutionRuleMappingContext( + callee_knl.substitutions, vng) + smap = KernelInliner(rule_mapping_context, + make_subst_func({old_name: prim.Variable(new_name) + for old_name, new_name in name_map.items()}), + caller_knl, callee_knl, arg_map) + + callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel( + callee_knl)) + + # }}} + + # {{{ generate new ids for instructions + + insn_id_map = {} + for insn in callee_knl.instructions: + insn_id_map[insn.id] = ing(callee_label+insn.id) + + # }}} + + # {{{ use NoOp to mark the 
start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=call_insn.within_inames, + depends_on=call_insn.depends_on + ) + noop_end = NoOpInstruction( + id=call_insn.id, + within_inames=call_insn.within_inames, + depends_on=frozenset(insn_id_map.values()) + ) + + # }}} + + # {{{ map callee's instruction ids + + inlined_insns = [noop_start] + + for insn in callee_knl.instructions: + new_within_inames = (frozenset(name_map[iname] + for iname in insn.within_inames) + | call_insn.within_inames) + new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on) + | {noop_start.id}) + new_no_sync_with = frozenset((insn_id_map[id], scope) + for id, scope in insn.no_sync_with) + new_id = insn_id_map[insn.id] + + if isinstance(insn, Assignment): + new_atomicity = tuple( + type(atomicity)(name_map[atomicity.var_name]) + for atomicity in insn.atomicity) + insn = insn.copy( + id=insn_id_map[insn.id], + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + atomicity=new_atomicity, + no_sync_with=new_no_sync_with + ) + else: + insn = insn.copy( + id=new_id, + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + no_sync_with=new_no_sync_with + ) + inlined_insns.append(insn) + + inlined_insns.append(noop_end) + + # }}} + + # {{{ swap out call_insn with inlined_instructions + + idx = caller_knl.instructions.index(call_insn) + new_insns = (caller_knl.instructions[:idx] + + inlined_insns + + caller_knl.instructions[idx+1:]) + + # }}} + + old_assumptions, new_assumptions = isl.align_two( + caller_knl.assumptions, new_assumptions) + + return caller_knl.copy(instructions=new_insns, + temporary_variables=new_temps, + domains=caller_knl.domains+new_domains, + assumptions=old_assumptions.params() & new_assumptions.params(), + iname_to_tags=new_iname_to_tags) + +# }}} + + +# {{{ inline callable kernel + +def _inline_single_callable_kernel(caller_kernel, callee_kernel, + callables_table): + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? ~AK + if insn.expression.function.name == callee_kernel.name: + caller_kernel = _inline_call_instruction( + caller_kernel, callee_kernel, insn) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return caller_kernel + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. 
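+
+    A rough usage sketch (assuming *program* contains a callable kernel
+    registered under the name ``"callee"``)::
+
+        import loopy as lp
+
+        program = lp.inline_callable_kernel(program, "callee")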
+ """ + from loopy.preprocess import infer_arg_descr + from loopy.program import resolve_callables + program = resolve_callables(program) + program = infer_arg_descr(program) + callables_table = program.callables_table + new_callables = {} + callee = program[function_name] + + for func_id, in_knl_callable in callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + caller = in_knl_callable.subkernel + in_knl_callable = in_knl_callable.copy( + subkernel=_inline_single_callable_kernel(caller, + callee, program.callables_table)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. + """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super().map_subscript(expr) + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, callee_knl): + """ + :returns: a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimensions required by *caller_knl*. + """ + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name != + callee_knl.name): + # Call to a callable kernel can only occur through a + # CallInstruction. 
+ continue + + def _shape_1_if_empty(shape): + assert isinstance(shape, tuple) + if shape == (): + return (1, ) + else: + return shape + + from loopy.kernel.function_interface import ( + ArrayArgDescriptor, get_arg_descriptor_for_expression, + get_kw_pos_association) + _, pos_to_kw = get_kw_pos_association(callee_knl) + arg_id_to_shape = {} + for arg_id, arg in insn.arg_id_to_val().items(): + arg_id = pos_to_kw[arg_id] + + arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) + if isinstance(arg_descr, ArrayArgDescriptor): + arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr.shape) + else: + arg_id_to_shape[arg_id] = (1, ) + + dim_changer = DimChanger( + callee_knl.arg_dict, + arg_id_to_shape) + + new_callee_insns = [] + for callee_insn in callee_knl.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions + new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + + return new_callee_knl + + +class _FunctionCalledChecker(CombineMapper): + def __init__(self, func_name): + self.func_name = func_name + + def combine(self, values): + return any(values) + + def map_call(self, expr): + if expr.function.name == self.func_name: + return True + return self.combine( + tuple( + self.rec(child) for child in expr.parameters) + ) + + map_call_with_kwargs = map_call + + def map_constant(self, expr): + return False + + def map_algebraic_leaf(self, expr): + return False + + def map_kernel(self, kernel): + return any(self.rec(insn.expression) for insn in kernel.instructions if + isinstance(insn, MultiAssignmentBase)) + + +def _match_caller_callee_argument_dimension_(program, callee_function_name): + """ + Returns a copy of *program* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *program* aligned with the argument + dimensions required by *caller_knl*. + + .. note:: + + The callee kernel addressed by *callee_function_name*, should be + called at only one location throughout the program, as multiple + invocations would demand complex renaming logic which is not + implemented yet. 
+ """ + + # {{{ sanity checks + + assert isinstance(program, Program) + assert isinstance(callee_function_name, str) + assert callee_function_name not in program.entrypoints + assert callee_function_name in program.callables_table + + # }}} + + is_invoking_callee = _FunctionCalledChecker( + callee_function_name).map_kernel + + caller_knl, = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel) and + is_invoking_callee(in_knl_callable.subkernel)] + + from pymbolic.primitives import Call + assert len([insn for insn in caller_knl.instructions if (isinstance(insn, + CallInstruction) and isinstance(insn.expression, Call) and + insn.expression.function.name == callee_function_name)]) == 1 + new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, program[callee_function_name]) + return program.with_kernel(new_callee_kernel) + +# }}} + + +def rename_callable(program, old_name, new_name=None, existing_ok=False): + """ + :arg program: An instance of :class:`loopy.Program` + :arg old_name: The callable to be renamed + :arg new_name: New name for the callable to be renamed + :arg existing_ok: An instance of :class:`bool` + """ + from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) + from pymbolic import var + + assert isinstance(program, Program) + assert isinstance(old_name, str) + + if (new_name in program.callables_table) and not existing_ok: + raise LoopyError(f"callables named '{new_name}' already exists") + + if new_name is None: + namegen = UniqueNameGenerator(program.callables_table.keys()) + new_name = namegen(old_name) + + assert isinstance(new_name, str) + + new_callables_table = {} + + for name, clbl in program.callables_table.items(): + if name == old_name: + name = new_name + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + smap = RuleAwareSubstitutionMapper(rule_mapping_context, + {var(old_name): var(new_name)}.get, + within=lambda *args: True) + knl = rule_mapping_context.finish_kernel(smap.map_kernel(knl)) + clbl = clbl.copy(subkernel=knl.copy(name=name)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + new_callables_table[name] = clbl + + new_entrypoints = program.entrypoints.copy() + if old_name in new_entrypoints: + new_entrypoints = ((new_entrypoints | frozenset([new_name])) + - frozenset([old_name])) + + return program.copy(callables_table=new_callables_table, + entrypoints=new_entrypoints) + + +# vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 4851ffdece47dc092011991c5b7218d96ea953c0..185af24c47eaa569abf8b2ac617b7fb07fa47939 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -26,6 +26,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -136,7 +139,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value 
        # here, distinct from the default.
        default_tag=_not_provided,

@@ -235,6 +239,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     This function internally uses :func:`extract_subst` and :func:`precompute`.
     """
+    assert isinstance(kernel, LoopKernel)
 
     # {{{ fish indexing out of var_name and into footprint_subscripts
 
@@ -327,9 +332,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # precompute module, but precompute actually uses that to adjust its
     # warning message.
 
-    from loopy.transform.precompute import precompute
-    new_kernel = precompute(kernel, subst_use, sweep_inames,
-            precompute_inames=dim_arg_names,
+    from loopy.transform.precompute import precompute_for_single_kernel
+    new_kernel = precompute_for_single_kernel(kernel, callables_table,
+            subst_use, sweep_inames, precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=var_descr.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
@@ -362,6 +367,29 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     else:
         return new_kernel
 
+
+def add_prefetch(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_callables = {}
+    for func_id, in_knl_callable in program.callables_table.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = add_prefetch_for_single_kernel(
+                    in_knl_callable.subkernel, program.callables_table,
+                    *args, **kwargs)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_callables[func_id] = in_knl_callable
+
+    return program.copy(callables_table=new_callables)
+
 # }}}
 
 
@@ -384,6 +412,7 @@ def change_arg_to_image(kernel, name):
 
 # {{{ tag array axes
 
+@iterate_over_kernels_if_given_program
 def tag_array_axes(kernel, ary_names, dim_tags):
     """
     :arg dim_tags: a tuple of
@@ -422,13 +451,15 @@ def tag_array_axes(kernel, ary_names, dim_tags):
     return kernel
 
 
-tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes)
+tag_data_axes = (
+        MovedFunctionDeprecationWrapper(tag_array_axes))
 
 # }}}
 
 
 # {{{ set_array_axis_names
 
+@iterate_over_kernels_if_given_program
 def set_array_axis_names(kernel, ary_names, dim_names):
     """
     .. versionchanged:: 2016.2
@@ -453,13 +484,15 @@ def set_array_axis_names(kernel, ary_names, dim_names):
     return kernel
 
 
-set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names)
+set_array_dim_names = (MovedFunctionDeprecationWrapper(
+    set_array_axis_names))
 
 # }}}
 
 
 # {{{ remove_unused_arguments
 
+@iterate_over_kernels_if_given_program
 def remove_unused_arguments(kernel):
     new_args = []
 
@@ -501,6 +534,7 @@ def remove_unused_arguments(kernel):
 
 # {{{ alias_temporaries
 
+@iterate_over_kernels_if_given_program
 def alias_temporaries(kernel, names, base_name_prefix=None,
        synchronize_for_exclusive_use=True):
    """Sets all temporaries given by *names* to be backed by a single piece of
@@ -585,11 +619,14 @@
 
 # {{{ set argument order
 
+@iterate_over_kernels_if_given_program
 def set_argument_order(kernel, arg_names):
    """
    :arg arg_names: A list (or comma-separated string) of argument names. All
        arguments must be in this list.
    """
+    #FIXME: @inducer -- should this only affect the root kernel, or should it
+    # take a within?
if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -618,6 +655,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@iterate_over_kernels_if_given_program def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 @@ -691,6 +729,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@iterate_over_kernels_if_given_program def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -732,6 +771,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule( kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 59428cde258436c3e30f4f82b23d9c6b423605b8..124568f4512340a812d6fd366318cceb0fea2591 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -31,6 +31,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -348,6 +349,8 @@ class DifferentiationContext: arg.dtype, shape=shape, dim_tags=dim_tags, + is_input=arg.is_input, + is_output=arg.is_output )) elif var_name in self.kernel.temporary_variables: @@ -377,6 +380,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ + assert isinstance(kernel, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index f6d0e1a0932916b9fdf59e54ef10b45f180fb962..0880c22ae7d4ba5b2f579e4579de768c16046b9c 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -27,6 +27,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -252,9 +256,6 @@ def _fuse_two_kernels(kernela, kernelb): "substitution", kernela.substitutions, kernelb.substitutions), - function_manglers=_ordered_merge_lists( - kernela.function_manglers, - kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( kernela.symbol_manglers, kernelb.symbol_manglers), @@ -327,6 +328,25 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + # FIXME: This should take in inputs as (prog1, knlname1) and (prog2, + # knlname2). if prog1 == prog2 then the callable names belong to the same + # namespace, otherwise the kernel names should be uniquified. + # We should also somehow be able to know that callables like "sin"/"cos" + # belong to the global namespace and need not be uniquified. 
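+    # Until that lands: if Programs were passed in, unwrap each one into its
+    # single CallableKernel and fuse the underlying LoopKernels.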
+ if all(isinstance(kernel, Program) for kernel in kernels): + new_kernels = [] + for knl in kernels: + kernel_names = [i for i, clbl in + knl.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise NotImplementedError("Kernel containing more than one" + " callable kernel, not allowed for now.") + new_kernels.append(knl[kernel_names[0]]) + + kernels = new_kernels[:] + + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -405,6 +425,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # }}} - return result + from loopy.program import make_program + return make_program(result).with_entrypoints(result.name) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 32c56a5a3af5776d12a3a7a8b603e10c18389d59..324cec8e9a61df3e9ef51e2539ba7bdd3c9f6a67 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -29,6 +29,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel + __doc__ = """ .. currentmodule:: loopy @@ -74,6 +78,7 @@ __doc__ = """ # {{{ set loop priority +@iterate_over_kernels_if_given_program def set_loop_priority(kernel, loop_priority): from warnings import warn warn("set_loop_priority is deprecated. Use prioritize_loops instead. " @@ -88,6 +93,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -102,6 +108,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -330,6 +338,7 @@ def _split_iname_backend(kernel, iname_to_split, # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, *, outer_iname=None, inner_iname=None, @@ -356,6 +365,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -372,6 +383,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -506,6 +518,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super().map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """In a sense, the inverse of :func:`split_iname`. 
Takes in inames, finds their bounds (all but the first have to be bounded), and combines @@ -606,8 +619,8 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - within_inames=subst_within_inames(insn.within_inames)) - for insn in kernel.instructions] + within_inames=subst_within_inames(insn.within_inames)) if + within(kernel, insn) else insn for insn in kernel.instructions] kernel = (kernel .copy( @@ -632,7 +645,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) - return kernel + return remove_unused_inames(kernel, inames) # }}} @@ -662,7 +675,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given @@ -832,6 +847,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) +@iterate_over_kernels_if_given_program def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ @@ -992,7 +1008,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(kernel, use_boostable_into=None): +def get_iname_duplication_options(kernel, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1022,6 +1038,13 @@ def get_iname_duplication_options(kernel, use_boostable_into=None): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if isinstance(kernel, Program): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] + + assert isinstance(kernel, LoopKernel) + if use_boostable_into: raise LoopyError("'use_boostable_into=True' is no longer supported.") @@ -1069,6 +1092,10 @@ def has_schedulable_iname_nesting(kernel): :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
""" + if isinstance(kernel, Program): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] return not bool(next(get_iname_duplication_options(kernel), False)) # }}} @@ -1076,6 +1103,7 @@ def has_schedulable_iname_nesting(kernel): # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1325,6 +1353,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1344,6 +1373,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1367,6 +1397,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. @@ -1698,6 +1729,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1744,6 +1776,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the @@ -1782,6 +1815,7 @@ def add_inames_to_insn(kernel, inames, insn_match): # }}} +@iterate_over_kernels_if_given_program def add_inames_for_unused_hw_axes(kernel, within=None): """ Returns a kernel with inames added to each instruction diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 045850651f94ebed65afc24b0008a712b047dd20..a48e8eda7472a73116a9cfcb2c567b23191ead93 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -21,15 +21,38 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + if isinstance(program, LoopKernel): + return find_instructions_in_single_kernel(program, insn_match) + + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable))) + + return insns + # }}} @@ -54,6 +77,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -71,6 +95,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -88,7 +113,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -119,6 +145,7 @@ def add_dependency(kernel, insn_match, depends_on): # {{{ remove_instructions +@iterate_over_kernels_if_given_program def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. @@ -209,6 +236,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -231,6 +259,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -263,18 +292,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. """ + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -327,6 +359,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. 
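Most of the single-kernel transformations touched in this patch are lifted to
whole translation units by the ``iterate_over_kernels_if_given_program``
decorator. Its definition is not part of this patch; the following is only a
rough sketch of the pattern, assuming a ``Program`` whose ``callables_table``
maps names to ``CallableKernel``/``ScalarCallable`` instances::

    from functools import wraps

    from loopy.kernel import LoopKernel
    from loopy.kernel.function_interface import CallableKernel


    def iterate_over_kernels_if_given_program(transform):
        # If handed a Program, apply *transform* to the subkernel of every
        # CallableKernel; a plain LoopKernel passes straight through.
        @wraps(transform)
        def wrapper(program_or_knl, *args, **kwargs):
            if isinstance(program_or_knl, LoopKernel):
                return transform(program_or_knl, *args, **kwargs)

            new_callables = {}
            for name, clbl in program_or_knl.callables_table.items():
                if isinstance(clbl, CallableKernel):
                    clbl = clbl.copy(
                            subkernel=transform(clbl.subkernel,
                                *args, **kwargs))
                new_callables[name] = clbl

            return program_or_knl.copy(callables_table=new_callables)

        return wrapper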
diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py new file mode 100644 index 0000000000000000000000000000000000000000..b8db7f43f90a5a1203dea470c9a0ba6f8fa21cae --- /dev/null +++ b/loopy/transform/make_scalar.py @@ -0,0 +1,51 @@ +from pymbolic.primitives import Variable +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.kernel.data import ValueArg +from loopy.transform.iname import remove_unused_inames + + +class ScalarChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, var_name): + self.var_name = var_name + super().__init__(rule_mapping_context) + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name == self.var_name: + return Variable(self.var_name) + + return super().map_subscript(expr, expn_state) + + +def make_scalar(kernel, var_name): + rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, + kernel.get_var_name_generator()) + + kernel = ScalarChanger(rule_mapping_context, var_name).map_kernel(kernel) + + new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, + is_output=arg.is_output) if arg.name == var_name else arg for + arg in kernel.args] + new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) + if tv.name == var_name else (tv.name, tv) for tv in + kernel.temporary_variables.values()) + + return kernel.copy(args=new_args, temporary_variables=new_temps) + + +def remove_invariant_inames(kernel): + inames_used = set() + untagged_inames = ( + kernel.all_inames() - frozenset(kernel.iname_to_tags.keys())) + for insn in kernel.instructions: + for iname in ((insn.read_dependency_names() + | insn.write_dependency_names()) + & untagged_inames): + inames_used.add(iname) + + removable_inames = untagged_inames - inames_used + + new_insns = [insn.copy(within_inames=insn.within_inames-removable_inames) + for insn in kernel.instructions] + + return remove_unused_inames(kernel.copy(instructions=new_insns), + removable_inames) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 0000000000000000000000000000000000000000..cf0730760417e9c439e05fcd9f19449cb98aba02 --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,340 @@ +__copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: pack_and_unpack_args_for_call +""" + + +def pack_and_unpack_args_for_call_for_single_kernel(kernel, + callables_table, call_name, args_to_pack=None, + args_to_unpack=None): + """ + Returns a a copy of *kernel* with instructions appended to copy the + arguments in *args* to match the alignment expected by the *call_name* in + the kernel. The arguments are copied back to *args* with the appropriate + data layout. + + :arg call_name: An instance of :class:`str` denoting the function call in + the *kernel*. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` which + must be packed. If set *None*, it is interpreted that all the array + arguments would be packed. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` + which must be unpacked. If set *None*, it is interpreted that + all the array arguments should be unpacked. + """ + assert isinstance(kernel, LoopKernel) + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + old_insn_to_new_insns = {} + + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + # pack and unpack call only be done for CallInstructions. + continue + if insn.expression.function.name not in callables_table: + continue + + in_knl_callable = callables_table[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args_to_pack is None: + args_to_pack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + if args_to_unpack is None: + args_to_unpack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + + # {{{ sanity checks for args + + assert isinstance(args_to_pack, list) + assert isinstance(args_to_unpack, list) + + for arg in args_to_pack: + found_sub_array_ref = False + + for par in parameters + insn.assignees: + # checking that the given args is a sub array ref + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." % (arg, call_name, insn.id)) + for arg in args_to_unpack: + if arg not in args_to_pack: + raise LoopyError("Argument %s should be packed in order to be " + "unpacked." 
% arg) + + # }}} + + packing_insns = [] + unpacking_insns = [] + + # {{{ handling ilp tags + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = {iname for iname in insn.within_inames + if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) + for tag in kernel.iname_to_tags.get(iname, []))} + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + # }}} + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + # dict to store the new assignees and parameters, the mapping pattern + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype + id_to_parameters = tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} + + for arg_id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in + args_to_pack): + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} + + # Updating the domains corresponding to the new inames. 
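+                # (Each swept iname gets two fresh copies, one for the
+                # packing loop nest and one for the unpacking loop nest, so
+                # the corresponding domains are duplicated and renamed to
+                # match.)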
+ for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=arg_in_caller.dtype, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) + + # {{{ getting the lhs for packing and rhs for unpacking + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) + + # }}} + + packing_insns.append(Assignment( + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack"), + depends_on_is_final=True + )) + + if p.subscript.aggregate.name in args_to_unpack: + unpacking_insns.append(Assignment( + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), + depends_on_is_final=True + )) + + # {{{ creating the sweep inames for the new sub array refs + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[arg_id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[arg_id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[arg_id] = p + 
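+    # At this point every packed argument of the call has been rewritten to
+    # a contiguous temporary: e.g. a parameter ``[j]: x[2*j]`` becomes
+    # ``[i_packsweep_x]: x_pack[i_packsweep_x]``, preceded by an assignment
+    # that fills ``x_pack`` (and, for unpacked args, followed by one that
+    # writes the result back through the original data layout).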
+ if packing_insns: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_call_insn = insn.with_transformed_expressions(subst_mapper) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in + enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) + new_call_insn = new_call_insn.copy( + depends_on=new_call_insn.depends_on | { + pack.id for pack in packing_insns}, + within_inames=new_call_insn.within_inames - ilp_inames | ( + new_ilp_inames), + expression=new_call_insn.expression.function(*new_params), + assignees=new_assignees) + old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] + + unpacking_insns) + + if old_insn_to_new_insns: + new_instructions = [] + for insn in kernel.instructions: + if insn.id in old_insn_to_new_insns: + # Replacing the current instruction with the group of + # instructions including the packing and unpacking instructions + new_instructions.extend(old_insn_to_new_insns[insn.id]) + else: + # for the instructions that depend on the call instruction that + # are to be packed and unpacked, we need to add the complete + # instruction block as a dependency for them. + new_depends_on = insn.depends_on + if insn.depends_on & set(old_insn_to_new_insns): + # need to add the unpack instructions on dependencies. + for old_insn_id in insn.depends_on & set(old_insn_to_new_insns): + new_depends_on |= frozenset(i.id for i + in old_insn_to_new_insns[old_insn_id]) + new_instructions.append(insn.copy(depends_on=new_depends_on)) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + instructions=new_instructions, + temporary_variables=new_tmps + ) + + return kernel + + +def pack_and_unpack_args_for_call(program, *args, **kwargs): + assert isinstance(program, Program) + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + +# vim: foldmethod=marker diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 390fafb2b71b705acaa990e45d2d4d4b9fc59cbe..455ce31d03fa30476b9154ea2773a06b3db1b17d 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,6 +24,11 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.diagnostic import LoopyError + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -40,7 +45,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -242,7 +249,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, return kernel -split_arg_axis = MovedFunctionDeprecationWrapper(split_array_dim) +split_arg_axis = (MovedFunctionDeprecationWrapper(split_array_dim)) # }}} @@ -366,7 +373,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -384,6 +393,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): ``loopy.split_array_dim`` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -399,6 +409,15 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): # {{{ find_padding_multiple def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return find_padding_multiple(kernel[kernel_names[0]], variable, axis, + align_bytes, allowed_waste) + assert isinstance(kernel, LoopKernel) + arg = kernel.arg_dict[variable] if arg.dim_tags is None: @@ -436,6 +455,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 60fef9e9852fcff2e6a3a9929d45bc59508fbcb7..52feb577a21ba473827bd70830373e91ec0dd1f0 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -37,6 +40,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. 
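+
+    A sketch of typical use, ``n`` being a parameter of the kernel's
+    domain::
+
+        knl = lp.assume(knl, "n mod 16 = 0 and n >= 1")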
@@ -64,18 +68,8 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def fix_parameters(kernel, within=None, **value_dict): - """Fix the values of the arguments to specific constants. - - *value_dict* consists of *name*/*value* pairs, where *name* will be fixed - to be *value*. *name* may refer to :ref:`domain-parameters` or - :ref:`arguments`. - """ - - if not value_dict: - return kernel - - def process_set_one_param(s, name, value): +def _fix_parameter(kernel, name, value, remove_argument, within=None): + def process_set(s): var_dict = s.get_var_dict() try: @@ -95,15 +89,10 @@ def fix_parameters(kernel, within=None, **value_dict): return s - def process_set(s): - for name, value in value_dict.items(): - s = process_set_one_param(s, name, value) - return s - new_domains = [process_set(dom) for dom in kernel.domains] from pymbolic.mapper.substitutor import make_subst_func - subst_func = make_subst_func(value_dict) + subst_func = make_subst_func({name: value}) from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper subst_map = SubstitutionMapper(subst_func) @@ -115,8 +104,7 @@ def fix_parameters(kernel, within=None, **value_dict): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name in value_dict.keys(): - # remove from argument list + if arg.name == name and remove_argument: continue if not isinstance(arg, ArrayBase): @@ -146,6 +134,29 @@ def fix_parameters(kernel, within=None, **value_dict): )) +@iterate_over_kernels_if_given_program +def fix_parameters(kernel, **value_dict): + """Fix the values of the arguments to specific constants. + + *value_dict* consists of *name*/*value* pairs, where *name* will be fixed + to be *value*. *name* may refer to :ref:`domain-parameters` or + :ref:`arguments`. + """ + assert isinstance(kernel, LoopKernel) + + # FIXME: Parameter / argument terminology? + + # FIXME: Is _remove the right approach? (I'm not sure it is.) Because of + # the potential namespace conflict. If yes, document. If no, fix. + + remove_arg = value_dict.pop("_remove", True) + within = value_dict.pop("within", None) + + for name, value in value_dict.items(): + kernel = _fix_parameter(kernel, name, value, remove_arg, within) + + return kernel + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index cefed807d73bd0a9064c170190a3ba19b2d5abf6..438c07339b217f21d3e60c4f2f87050ea5b2d0d7 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -27,6 +27,8 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np from pymbolic import var @@ -255,9 +257,9 @@ class _not_provided: # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, callables_table, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. @@ -352,6 +354,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
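+
+    A sketch of typical use (the substitution rule and iname names are
+    illustrative)::
+
+        knl = lp.extract_subst(knl, "a_subst", "a[i]")
+        knl = lp.precompute(knl, "a_subst", sweep_inames="i",
+                default_tag="l.auto")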
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(precompute(kernel[kernel_names[0]], + subst_use, sweep_inames, within, storage_axes, temporary_name, + precompute_inames, precompute_outer_inames, storage_axis_to_tag, + default_tag, dtype, fetch_bounding_box, temporary_address_space, + compute_insn_id, kernel.callables_table, **kwargs)) # {{{ unify temporary_address_space / temporary_scope @@ -1030,15 +1044,34 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = precompute_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0a9cfb7bce21a64cc2858e4f3b9472e2992984b8..884e17f776ccb3d81a7c33ba195c2dcb5c7debfd 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -61,7 +61,7 @@ class LivenessAnalysis: def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -232,8 +232,9 @@ class TemporarySaver: def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, callables_table): self.kernel = kernel + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -436,7 +437,8 @@ class TemporarySaver: return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -623,7 +625,7 @@ class TemporarySaver: kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -717,7 +719,7 @@ class TemporarySaver: # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(kernel): +def save_and_reload_temporaries(program, entrypoint=None): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
@@ -740,13 +742,28 @@ def save_and_reload_temporaries(kernel): :returns: The resulting kernel """ - liveness = LivenessAnalysis(kernel) - saver = TemporarySaver(kernel) + if entrypoint is None: + if len(program.entrypoints) != 1: + raise LoopyError("Missing argument 'entrypoint'.") + entrypoint = list(program.entrypoints)[0] + + knl = program[entrypoint] + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program[entrypoint], + program.callables_table) + + assert knl.schedule is not None + + liveness = LivenessAnalysis(knl) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(kernel.schedule): + for sched_idx, sched_item in enumerate(knl.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -757,8 +774,9 @@ def save_and_reload_temporaries(kernel): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, + subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {} at entry of {}" @@ -766,20 +784,20 @@ def save_and_reload_temporaries(kernel): saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(kernel.schedule) - 1: + if sched_idx == len(knl.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {} before return of {}" .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 565c69a49d07c92311f750e4a8fce0db91ff9bb2..066cf326cc4f7ea45e693ea1f48910dbe5747ad1 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,6 +28,8 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -51,6 +53,16 @@ def extract_subst(kernel, subst_name, template, parameters=()): unifications. 
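+
+    A sketch of typical use (names are illustrative)::
+
+        knl = lp.extract_subst(knl, "e_subst", "a[i1]*b[i1]",
+                parameters="i1")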
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(extract_subst(kernel[kernel_names[0]], + subst_name, template, parameters)) + if isinstance(template, str): from pymbolic import parse template = parse(template) @@ -188,6 +200,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): instructions=new_insns, substitutions=new_substs) + # }}} @@ -275,6 +288,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program @remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): @@ -458,6 +472,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions @@ -466,6 +481,7 @@ def expand_subst(kernel, within=None): :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + if not kernel.substitutions: return kernel @@ -498,8 +514,17 @@ def find_rules_matching(kernel, pattern): return [r for r in kernel.substitutions if pattern.match(r)] -def find_one_rule_matching(kernel, pattern): - rules = find_rules_matching(kernel, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." % ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 787966efc7fd00ad282e60990846ce07004e7906..ee1ddf33d72adf405a84bf02a7d259f0eb2d66a5 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -29,6 +29,14 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext, SubArrayRef) +from pymbolic.primitives import Variable, Subscript, Lookup +from loopy.program import CallablesInferenceContext, make_clbl_inf_ctx import logging logger = logging.getLogger(__name__) @@ -40,10 +48,152 @@ def _debug(kernel, s, *args): logger.debug(f"{kernel.name}: {logstr}") +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super().__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super().map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} + ) + else: + return super().map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. 
code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, clbl_inf_ctx, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -52,10 +202,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(clbl_inf_ctx, CallablesInferenceContext) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.clbl_inf_ctx = clbl_inf_ctx + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -88,13 +241,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
- def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, clbl_inf_ctx=None): + if clbl_inf_ctx is None: + clbl_inf_ctx = self.clbl_inf_ctx + return type(self)(self.kernel, clbl_inf_ctx, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.clbl_inf_ctx, new_ass) @staticmethod def combine(dtype_sets): @@ -250,14 +406,23 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + + from pymbolic.primitives import Variable, CallWithKwargs, Call + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): - identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] + if not isinstance(identifier, ResolvedFunction): + # function not resolved => exit + return [] + + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name def none_if_empty(d): if d: @@ -266,25 +431,83 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] + # specializing the known function wrt type + in_knl_callable = self.clbl_inf_ctx[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+            for id, dtype in arg_id_to_dtype.items():
+                if id in in_knl_callable.arg_id_to_dtype and (
+                        in_knl_callable.arg_id_to_dtype[id] !=
+                        arg_id_to_dtype[id]):
+
+                    # {{{ ignore the cases where there is a discrepancy
+                    # between np.uint and np.int
+
+                    import numpy as np
+                    if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                            np.uint32) and (
+                                    arg_id_to_dtype[id].dtype.type == np.int32):
+                        continue
+                    if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                            np.uint64) and (
+                                    arg_id_to_dtype[id].dtype.type ==
+                                    np.int64):
+                        continue
+
+                    if np.can_cast(arg_id_to_dtype[id].dtype.type,
+                            in_knl_callable.arg_id_to_dtype[id].dtype.type):
+                        continue
+
+                    # }}}
+
+                    raise LoopyError("Overwriting a specialized function "
+                            "is illegal--maybe start with a new instance of "
+                            "InKernelCallable?")
+
+        # }}}
+
+        in_knl_callable, self.clbl_inf_ctx = (
+                in_knl_callable.with_types(
+                    arg_id_to_dtype,
+                    self.clbl_inf_ctx))
+
+        in_knl_callable = in_knl_callable.with_target(self.kernel.target)
+
+        # store the type-specialized function for later use
+        self.clbl_inf_ctx, new_function_id = (
+                self.clbl_inf_ctx.with_callable(
+                    expr.function.function,
+                    in_knl_callable))
+
+        if isinstance(expr, Call):
+            self.old_calls_to_new_calls[expr] = new_function_id
         else:
-            if mangle_result is not None:
-                if len(mangle_result.result_dtypes) != 1 and not return_tuple:
-                    raise LoopyError("functions with more or fewer than one "
-                            "return value may only be used in direct assignments")
+            assert isinstance(expr, CallWithKwargs)
+            self.old_calls_to_new_calls[expr] = new_function_id
+
+        new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype
+
+        if new_arg_id_to_dtype is None:
+            return []
+
+        # collect result dtypes in order of the assignees
+        if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None:
+            if return_tuple:
+                return [get_return_types_as_tuple(new_arg_id_to_dtype)]
+            else:
+                return [new_arg_id_to_dtype[-1]]
 
-                return [mangle_result.result_dtypes[0]]
+        return []
 
-        raise RuntimeError("unable to resolve "
-                "function '%s' with %d given arguments"
-                % (identifier, len(arg_dtypes)))
+    map_call_with_kwargs = map_call
 
     def map_variable(self, expr):
         if expr.name in self.kernel.all_inames():
@@ -352,11 +575,20 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_comparison(self, expr):
         # "bool" is unusable because OpenCL's bool has indeterminate memory
        # format.
+ self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) + return [NumpyType(np.dtype(np.int32))] + + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + def map_logical_and(self, expr): + for child in expr.children: + self.rec(child) + return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + map_logical_or = map_logical_and def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] @@ -393,20 +625,112 @@ class TypeInferenceMapper(CombineMapper): rec_results = self.rec(expr.expr) if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) + return [expr.operation.result_dtypes(*rec_result) for rec_result in rec_results] else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + return [expr.operation.result_dtypes(rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.subscript) + +# }}} + + +# {{{ TypeReader + +class TypeReader(TypeInferenceMapper): + def __init__(self, kernel, callables, new_assignments={}): + self.kernel = kernel + self.callables = callables + self.new_assignments = new_assignments + + # {{{ disabled interface + + def copy(self, *args, **kwargs): + raise ValueError("Not allowed in TypeReader") + + # }}} + + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, self.callables, new_ass) + + def map_call(self, expr, return_tuple=False): + identifier = expr.function + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.callables[expr.function.name] + + arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in arg_id_to_dtype and arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(arg_id_to_dtype)] + else: + return [arg_id_to_dtype[-1]] + + return [] + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto + result = [obj.dtype] + if result[0] is None: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(expr.name))) + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + map_call_with_kwargs = map_call + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + 
type_inf_mapper.clbl_inf_ctx) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +775,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.clbl_inf_ctx) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.clbl_inf_ctx) # }}} @@ -482,7 +810,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +872,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, clbl_inf_ctx, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +882,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -575,10 +906,15 @@ def infer_unknown_types(kernel, expect_completion=False): item = item_lookup[name] debug("inferring type for %s %s", type(item).__name__, item.name) - - result, symbols_with_unavailable_types = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) + try: + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, clbl_inf_ctx) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + except DependencyTypeInferenceFailure: + result = tuple() + type_inf_mapper = type_inf_mapper.copy( + clbl_inf_ctx=clbl_inf_ctx) failed = not result if not failed: @@ -597,6 +933,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -609,14 +946,10 @@ def infer_unknown_types(kernel, expect_completion=False): " (need type of '%s'--check for missing arguments)" % ", ".join(symbols_with_unavailable_types)) - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break + debug("could not determine type of '%s'%s" + % (item.name, advice)) + # We're done here + break # remember that this item failed failed_names.add(item.name) @@ -624,7 +957,6 @@ def infer_unknown_types(kernel, expect_completion=False): if set(queue) == failed_names: # We did what we could... print(queue, failed_names, item.name) - assert not expect_completion break # can't infer type yet, put back into queue @@ -635,23 +967,134 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. 
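+    # (In brief: an instruction whose assignees all carry known types is
+    # never visited by the per-variable inference above, so the walk below
+    # revisits its expression once, purely so that any function calls in it
+    # still get resolved and type-specialized.)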
+ + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + if _instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, + return_tuple=len(insn.assignees) != 1, + return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + clbl_inf_ctx = type_inf_mapper.clbl_inf_ctx + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + return type_specialized_kernel, clbl_inf_ctx + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel.data import auto + from loopy.program import resolve_callables + + program = resolve_callables(program) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() + + for e in program.entrypoints: + logger.debug(f"Entering entrypoint: {e}") + arg_id_to_dtype = {arg.name: arg.dtype for arg in + program[e].args if arg.dtype not in (None, auto)} + new_callable, clbl_inf_ctx = program.callables_table[e].with_types( + arg_id_to_dtype, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) + + if expect_completion: + from loopy.types import LoopyType + new_knl = new_callable.subkernel + + args_not_inferred = {arg.name + for arg in new_knl.args + if not isinstance(arg.dtype, LoopyType)} + + tvs_not_inferred = {tv.name + for tv in new_knl.temporary_variables.values() + if not isinstance(tv.dtype, LoopyType)} + + vars_not_inferred = tvs_not_inferred | args_not_inferred + + if vars_not_inferred: + if expect_completion: + raise LoopyError("could not determine type of" + f" 
'{vars_not_inferred.pop()}' of kernel '{e}'.")
+
+    return clbl_inf_ctx.finish_program(program, renamed_entrypoints)
+
 # }}}
 
 
 # {{{ reduction expression helper
 
 def infer_arg_and_reduction_dtypes_for_reduction_expression(
-        kernel, expr, unknown_types_ok):
-    type_inf_mapper = TypeInferenceMapper(kernel)
+        kernel, expr, callables_table, unknown_types_ok):
+    type_inf_mapper = TypeReader(kernel, callables_table)
     import loopy as lp
 
     if expr.is_tuple_typed:
@@ -676,7 +1119,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
         raise LoopyError("failed to determine type of accumulator for "
                 "reduction '%s'" % expr)
 
-    reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes)
+    reduction_dtypes = expr.operation.result_dtypes(*arg_dtypes)
 
     reduction_dtypes = tuple(
             dt.with_target(kernel.target)
             if dt is not lp.auto else dt
diff --git a/loopy/types.py b/loopy/types.py
index de6208476e270eb5aab2595f05def4f771bcf901..2457049073eab8c73202e324514526097b56c4d1 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -193,6 +193,45 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}
 
 
+# {{{ opaque type
+
+class OpaqueType(LoopyType):
+    """An opaque data type is truly opaque: no storage of this type may be
+    allocated, no temporaries of this type may be created, and so on. The
+    only permitted use is for a value to be passed in through one ValueArg
+    and out through another. It is introduced to accommodate calls to
+    external library functions.
+    """
+    def __init__(self, name):
+        assert isinstance(name, str)
+        self.name = name
+        self.target = None
+
+    def is_integral(self):
+        return False
+
+    def is_complex(self):
+        return False
+
+    def involves_complex(self):
+        return False
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, self.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def __eq__(self, other):
+        return (
+                type(self) == type(other)
+                and self.name == other.name)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+# }}}
+
+
 def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
         target=None):
     from loopy.kernel.data import auto
diff --git a/setup.py b/setup.py
index 57f5e895d0fdd39e3be7e521c23c1ad3324da08e..4f56bc367f08308d46cdfde4e9ee0efa6f1f7ccf 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ setup(name="loopy",
           "codepy>=2017.1",
           "colorama",
           "Mako",
+          "pyrsistent",
       ],
 
       extras_require={
diff --git a/test/library_for_test.py b/test/library_for_test.py
index 2cb4067e0acd6f4a88ff166e0fd460ec925585f2..cfaacdc0ef2df0a76209398dac1cde7a40a1b336 100644
--- a/test/library_for_test.py
+++ b/test/library_for_test.py
@@ -1,23 +1,61 @@
-# This exists because function handles can't be pickled.
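+"""Callables used by the test suite. (Like the function manglers they
+replace, these are kept in a module of their own so that kernels
+referencing them stay picklable.)"""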
+import loopy as lp +import numpy as np -def no_ret_f_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class NoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + if len(arg_id_to_dtype) != 0: + raise RuntimeError("'f' cannot take any inputs.") - if (name == "f" and len(arg_dtypes) == 0): - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="f", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") -def no_ret_f_preamble_gen(preamble_info): - yield ("10_define_f", - r""" - void f() - { - printf("Hi!\n"); - } - """) + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + yield ("10_define_f", + r""" + void f() + { + printf("Hi!\n"); + } + """) + + +class SingleArgNoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + input_dtype = arg_id_to_dtype.get(0) + if input_dtype is None: + return self, callables + + if input_dtype.numpy_dtype != np.float32: + raise RuntimeError("'f' only supports f32.") + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") + + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + + yield ("10_define_f", + r""" + void f(float x) + { + printf("Hi!\n"); + } + """) diff --git a/test/test_apps.py b/test/test_apps.py index 56f4127ac6be827afda8bd41b6e87ee6d5e774dc..6e49e73fafae569411ad68fb8fefd24b5315087f 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -217,7 +217,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -225,13 +226,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -297,7 +297,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -311,14 +312,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -661,9 +662,10 @@ def test_domain_tree_nesting(): TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), - lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)], + name="nested_domain") - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl["nested_domain"].parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index a204859fff57e4806ac9ebd8204acded021512ac..1c79241cfe4f78f574655c230fa1c393d2c4b51e 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -111,11 +111,12 @@ def test_c_target_strides_nonsquare(): lp.GlobalArg("a", np.float32, shape=sizes, order=order), "..." ], - target=ExecutableCTarget()) + target=ExecutableCTarget(), + name="nonsquare_strides") # test with C-order knl = __get_kernel("C") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="C") @@ -125,7 +126,7 @@ def test_c_target_strides_nonsquare(): # test with F-order knl = __get_kernel("F") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="F") diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..ef22b163294793d86478a2fa0e3a913cfdeb6382 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,768 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import Log2Callable + + x = np.random.rand(10) + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_callable(prog, "log2", Log2Callable("log2")) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_function( + "{[i, j]:0<= i, j< 4}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name="linear_combo1") + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name="linear_combo2") + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] + ) + + knl = lp.merge([grandchild_knl, child_knl, parent_knl]) + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo2") + knl = lp.inline_callable_kernel(knl, "linear_combo1") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<4}", + """ + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y, z", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] 
+ ) + + knl = lp.merge([parent_knl, child_knl]) + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 4 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg("f, e, h, g"), ...], + name="linear_combo") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.merge([caller_knl, callee_knl]) + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, name="caller") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.merge([caller_knl, callee_knl]) + + knl = lp.set_options(knl, "return_dict") + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, out = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out["z"].get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 2*abs(a[i]) + """, name="callee_fn1") + + callee2 = lp.make_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + b[i, j] = 3*a[i, j] + """, name="callee_fn2") + + callee3 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 5*a[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + 
"{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.merge([knl, callee1]) + knl = lp.merge([knl, callee2]) + knl = lp.merge([knl, callee3]) + + if inline: + knl = lp.inline_callable_kernel(knl, "callee_fn1") + knl = lp.inline_callable_kernel(knl, "callee_fn2") + knl = lp.inline_callable_kernel(knl, "callee_fn3") + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict["y1"].get() + y2 = out_dict["y2"].get() + y3 = out_dict["y3"].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + [ + lp.GlobalArg("a"), + lp.GlobalArg("acc_i, index", is_input=False, is_output=True, + shape=lp.auto), + ...], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_diff.py b/test/test_diff.py index 8af2a2b057a52ef6e122ffa65caf85d777ccbbb1..c1bfd9093a09cd9c1f265eb5895b3c677bdb37bf 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -58,12 +58,15 @@ def test_diff(ctx_factory): """ <> a = 1/(1+sinh(x[i] + y[j])**2) z[i] = sum(j, exp(a * x[j])) - """) + """, name="diff") knl = lp.fix_parameters(knl, n=50) from loopy.transform.diff import diff_kernel - dknl, diff_map = diff_kernel(knl, "z", "x") + #FIXME Is this the correct interface. Does it make sense to take the entire + #translation unit? 
+    dknl, diff_map = diff_kernel(knl["diff"], "z", "x")
+    dknl = knl.with_kernel(dknl)
 
     dknl = lp.remove_unused_arguments(dknl)
 
     dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
diff --git a/test/test_domain.py b/test/test_domain.py
index 6a0d9f255faefc1e1e3e8fbd8c8f745b058ff1b9..03f1bbc2f538b03af8e7beb6b69d4132c99448e9 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -56,20 +56,15 @@ def test_assume(ctx_factory):
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             "a[i] = a[i] + 1",
             [lp.GlobalArg("a", np.float32, shape="n"), "..."])
 
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.prioritize_loops(knl, "i_outer,i_inner")
     knl = lp.assume(knl, "n mod 16 = 0")
     knl = lp.assume(knl, "n > 10")
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
-        assert "if" not in compiled.get_code()
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
 
 def test_divisibility_assumption(ctx_factory):
@@ -85,16 +80,14 @@ def test_divisibility_assumption(ctx_factory):
             lp.GlobalArg("b", np.float32, shape=("n",)),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and (exists zz: n = 16*zz)")
+            assumptions="n>=1 and (exists zz: n = 16*zz)",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 16)
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "if" not in code
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             parameters={"n": 16**3})
@@ -113,16 +106,12 @@ def test_eq_constraint(ctx_factory):
         [
             lp.GlobalArg("a", np.float32, shape=(1000,)),
             lp.GlobalArg("b", np.float32, shape=(1000,))
-            ])
+            ],
+        target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
     knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for knl in kernel_gen:
-        print(lp.generate_code(knl))
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds(ctx_factory):
@@ -145,12 +134,10 @@ def test_dependent_loop_bounds(ctx_factory):
             lp.GlobalArg("a_sum", dtype, shape=lp.auto),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_2(ctx_factory):
@@ -174,14 +161,13 @@ def test_dependent_loop_bounds_2(ctx_factory):
             lp.GlobalArg("ax", dtype, shape=lp.auto),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
             inner_tag="l.0")
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_3(ctx_factory):
@@ -206,25 +192,22 @@ def test_dependent_loop_bounds_3(ctx_factory):
             lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
             lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
             lp.ValueArg("n", np.int32),
-            ])
+            ],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
-    assert
knl.parents_per_domain()[1] == 0 + assert knl["loopy_kernel"].parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -280,17 +263,17 @@ def test_independent_multi_domain(ctx_factory): lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.GlobalArg("b", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + name="loopy_kernel") knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl["loopy_kernel"].parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -394,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j 1e-15 + assert abs_err < 1e-6 + + def test_fill(ctx_factory): fortran_src = """ subroutine fill(out, a, n) @@ -58,18 +149,18 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = [fill] + ! RESULT = fill ! 
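+      ! (everything between "!$loopy begin" and "!$loopy end" is Python:
+      ! parse_transformed_fortran runs it with the Fortran source bound
+      ! to SOURCE and expects the transformed result in RESULT)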
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.all_inames() + assert "i_inner" in knl["fill"].all_inames() ctx = ctx_factory() @@ -90,7 +181,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -113,7 +204,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -137,7 +228,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -164,7 +255,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -192,15 +283,15 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) ref_knl = knl - assert "a" in knl.temporary_variables + assert "a" in knl["fill"].temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.temporary_variables + assert "a" not in knl["fill"].temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -229,7 +320,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -263,7 +354,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -297,34 +388,34 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + prog = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(prog["dgemm"].domains) == 1 - ref_knl = knl + ref_prog = prog - knl = lp.split_iname(knl, "i", 16, + prog = lp.split_iname(prog, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 8, + prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 32) - knl = lp.assume(knl, "n mod 32 = 0") - knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "ell mod 16 = 0") - - knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") - knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + prog = lp.split_iname(prog, "k", 32) + prog = lp.assume(prog, "n mod 32 = 0") + prog = lp.assume(prog, "m mod 32 = 0") + prog = lp.assume(prog, "ell mod 16 = 0") + + prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2") + prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2") + prog = lp.precompute(prog, "a_acc", "k_inner,i_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + prog = lp.precompute(prog, "b_acc", "j_inner,k_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, + prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) + 
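+    # auto_test_vs_ref generates code for both programs, runs them, and
+    # compares the two results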
lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -362,7 +453,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -406,18 +497,19 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) - knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) - knl = lp.prioritize_loops(knl, "e,i,j,k") + knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]), + data_flow=[("result", 0, 1)]) + knl = knl.with_kernel(lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k")) - assert len(knl.temporary_variables) == 2 + assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) @@ -449,15 +541,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! !$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -477,9 +571,9 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl["dgemm"].domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") @@ -507,6 +601,53 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) +def test_fortran_subroutines(): + fortran_src = """ + subroutine twice(n, a) + implicit none + real*8 a(n) + integer i,n + + do i=1,n + a(i) = a(i) * 2 + end do + end subroutine + + subroutine twice_cross(n, a, i) + implicit none + integer i, n + real*8 a(n,n) + + call twice(n, a(1:n, i)) + call twice(n, a(i, 1:n)) + end subroutine + """ + prg = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") + print(lp.generate_code_v2(prg).device_code()) + + +def test_domain_fusion_imperfectly_nested(): + fortran_src = """ + subroutine imperfect(n, m, a, b) + implicit none + integer i, j, n, m + real a(n), b(n,n) + + do i=1, n + a(i) = i + do j=1, m + b(i,j) = i*j + end do + end do + end subroutine + """ + + prg = lp.parse_fortran(fortran_src) + # If n > 0 and m == 0, a single domain would be empty, + # leading (incorrectly) to no assignments to 'a'. 
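+    # (e.g. a fused domain "{[i,j]: 0<=i<n and 0<=j<m}" becomes empty when
+    # m == 0, so even the a(i) = i assignments would be skipped)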
+    assert len(prg["imperfect"].domains) > 1
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c5295397552c0462da06ff126814f456e3bdcc6e..1e728eefb9ecc6430438992bb23f34877635a6f0 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -92,7 +92,7 @@ def test_complicated_subst(ctx_factory):
 
     print(knl)
 
-    sr_keys = list(knl.substitutions.keys())
+    sr_keys = list(knl["loopy_kernel"].substitutions.keys())
     for letter, how_many in [
             ("f", 1),
             ("g", 1),
@@ -102,8 +102,10 @@ def test_complicated_subst(ctx_factory):
         assert substs_with_letter == how_many
 
 
-def test_type_inference_no_artificial_doubles():
-    knl = lp.make_kernel(
+def test_type_inference_no_artificial_doubles(ctx_factory):
+    ctx = ctx_factory()
+
+    prog = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
             <> bb = a[i] - b[i]
@@ -115,16 +117,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
                 lp.GlobalArg("c", np.float32, shape=("n",)),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "double" not in code
+    code = lp.generate_code_v2(prog).device_code()
+    assert "double" not in code
 
 
 def test_type_inference_with_type_dependencies():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: i=0}",
             """
             <>a = 99
@@ -136,13 +137,17 @@ def test_type_inference_with_type_dependencies():
             <>d = b + 2 + 1j
             """,
             "...")
-    knl = lp.infer_unknown_types(knl)
+    prog = lp.infer_unknown_types(prog)
 
     from loopy.types import to_loopy_type
-    assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32)
-    assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128)
+    assert prog["loopy_kernel"].temporary_variables["a"].dtype == to_loopy_type(
+            np.int32)
+    assert prog["loopy_kernel"].temporary_variables["b"].dtype == to_loopy_type(
+            np.float32)
+    assert prog["loopy_kernel"].temporary_variables["c"].dtype == to_loopy_type(
+            np.float32)
+    assert prog["loopy_kernel"].temporary_variables["d"].dtype == to_loopy_type(
+            np.complex128)
 
 
 def test_sized_and_complex_literals(ctx_factory):
@@ -176,16 +181,12 @@ def test_simple_side_effect(ctx_factory):
         """
         a[i] = a[i] + 1
         """,
-        [lp.GlobalArg("a", np.float32, shape=(100,))]
+        [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0])
         )
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_owed_barriers(ctx_factory):
@@ -196,17 +197,14 @@ def test_owed_barriers(ctx_factory):
         [
             " z[i] = a[i]"
             ],
-        [lp.GlobalArg("a", np.float32, shape=(100,))]
+        [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0])
        )
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_wg_too_small(ctx_factory):
@@ -218,17 +216,14 @@ def test_wg_too_small(ctx_factory):
             " z[i] = a[i] {id=copy}"
             ],
         [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0]),
         local_sizes={0: 16})
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    import pytest
-    for gen_knl in kernel_gen:
-        with pytest.raises(RuntimeError):
-            lp.CompiledKernel(ctx, gen_knl).get_code()
+    print(knl)
+    with pytest.raises(RuntimeError):
+        print(lp.generate_code_v2(knl))
 
 
 def test_multi_cse(ctx_factory):
@@ -240,17 +235,14 @@ def test_multi_cse(ctx_factory):
             " z[i] = a[i] + a[i]**2"
             ],
         [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0]),
         local_sizes={0: 16})
 
     knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_bare_data_dependency(ctx_factory):
@@ -280,7 +272,9 @@ def test_bare_data_dependency(ctx_factory):
 
 # {{{ test race detection
 
-def test_ilp_write_race_detection_global():
+def test_ilp_write_race_detection_global(ctx_factory):
+    ctx = ctx_factory()
+
     knl = lp.make_kernel(
             "[n] -> {[i,j]: 0<=i,j<n }",
             [
                 "a[i] = 5+i+j",
                 ],
             [
                 lp.GlobalArg("a", np.float32),
                 lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
 
     with lp.CacheMode(False):
         from loopy.diagnostic import WriteRaceConditionWarning
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
-            list(lp.generate_loop_schedules(knl))
+            lp.generate_code_v2(knl)
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
 
 
-def test_ilp_write_race_avoidance_local():
+def test_ilp_write_race_avoidance_local(ctx_factory):
+    ctx = ctx_factory()
+
     knl = lp.make_kernel(
             "{[i,j]: 0<=i<16 and 0<=j<17 }",
             [
                 "<> a[i] = 5+i+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
     knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16, 17)
+    assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17)
 
 
-def test_ilp_write_race_avoidance_private():
+def test_ilp_write_race_avoidance_private(ctx_factory):
+    ctx = ctx_factory()
     knl = lp.make_kernel(
             "{[j]: 0<=j<16 }",
             [
                 "<> a = 5+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16,)
+    assert knl["loopy_kernel"].temporary_variables["a"].shape == (16,)
 
 # }}}
 
@@ -354,11 +356,12 @@ def test_write_parameter(ctx_factory):
             lp.GlobalArg("b", dtype, shape=()),
             lp.ValueArg("n", np.int32, approximately=1000),
             ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     import pytest
     with pytest.raises(RuntimeError):
-        lp.CompiledKernel(ctx, knl).get_code()
+        lp.generate_code_v2(knl).device_code()
 
 
 # {{{ arg guessing
 
@@ -379,10 +382,11 @@ def test_arg_shape_guessing(ctx_factory):
             lp.GlobalArg("c", shape=lp.auto),
             lp.ValueArg("n"),
             ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing(ctx_factory):
@@ -395,10 +399,11 @@ def test_arg_guessing(ctx_factory):
             b[i, j] = i*j
             c[i+j, j] = b[j,i]
             """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing_with_reduction(ctx_factory):
@@ -413,16 +418,16 @@ def test_arg_guessing_with_reduction(ctx_factory):
             b[i, j] = i*j
             c[i+j, j] = b[j,i]
             """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def
test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -438,11 +443,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -459,10 +464,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -494,9 +500,9 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - #print(cknl.get_highlighted_code({"a": a.dtype})) - knl = lp.set_options(knl, write_cl=True) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + print(lp.generate_code_v2(knl)) knl(queue, a=a, b=b) import numpy.linalg as la @@ -514,18 +520,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -546,18 +550,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -598,10 +604,11 @@ def test_inames_deps_from_write_subscript(ctx_factory): [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), - "..."]) + "..."], + name="loopy_kernel") - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog["loopy_kernel"].insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -615,14 +622,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,11 +775,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -854,9 +855,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @@ -980,7 +979,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2274,10 +2264,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) @@ -2287,7 +2278,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ for i a[i] = i {id=a} @@ -2302,24 +2293,26 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, "i", vecsize, inner_tag="l.0") + prog = lp.split_iname(prog, "i", vecsize, inner_tag="l.0") from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper + from loopy.type_inference import TypeReader from loopy.library.reduction import SegmentedSumReductionOperation from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog["loopy_kernel"].get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2481,7 +2477,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == {"n"} + assert knl["loopy_kernel"].all_params() == {"n"} def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2500,7 +2496,7 @@ def 
test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2513,11 +2509,15 @@ def test_wildcard_dep_matching(): all_insns = {"insn%d" % i for i in range(1, 6)} - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - {"insn2"} - assert knl.id_to_insn["insn3"].depends_on == all_insns - {"insn3"} - assert knl.id_to_insn["insn4"].depends_on == {"insn1", "insn2"} - assert knl.id_to_insn["insn5"].depends_on == all_insns - {"insn1", "insn5"} + assert prog["loopy_kernel"].id_to_insn["insn1"].depends_on == set() + assert (prog["loopy_kernel"].id_to_insn["insn2"].depends_on == all_insns - + {"insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn3"].depends_on == all_insns - + {"insn3"}) + assert (prog["loopy_kernel"].id_to_insn["insn4"].depends_on == {"insn1", + "insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn5"].depends_on == all_insns - + {"insn1", "insn5"}) def test_preamble_with_separate_temporaries(ctx_factory): @@ -2581,12 +2581,14 @@ def test_preamble_with_separate_temporaries(ctx_factory): def test_arg_inference_for_predicates(): - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ if incr[i] a = a + 1 end - """) + """, name="loopy_kernel") + + knl = prog["loopy_kernel"] assert "incr" in knl.arg_dict assert knl.arg_dict["incr"].shape == (10,) @@ -2611,7 +2613,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ @@ -945,10 +1004,25 @@ def test_barrier_counter_barriers(): m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} - barrier_count = sync_map["barrier_local"].eval_with_dict(params) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 +def test_barrier_count_single(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + <> c[i] = 15*i {id=yoink} + c[i+1] = c[i] {dep=yoink} + """) + + knl = lp.tag_inames(knl, {"i": "l.0"}) + sync_map = lp.get_synchronization_map(knl) + print(sync_map) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() + assert barrier_count == 1 + + def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( @@ -975,21 +1049,21 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize + assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 + assert sync_map.filter_by(kind="barrier_local").eval_and_sum(params) == 2*m/bsize op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, "mul", CG.SUBGROUP) + lp.Op(np.float32, "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, "add", CG.SUBGROUP) + lp.Op(np.float32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, "add", CG.SUBGROUP) + lp.Op(np.int32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ 
-1002,13 +1076,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, direction="load", variable="b", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1, 1: Variable("m")}, gid_strides={0: Variable("m")*bsize}, direction="load", - variable="a", count_granularity=CG.WORKITEM) + variable="a", count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1018,7 +1094,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1037,14 +1114,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable="a_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), direction="load", lid_strides={0: 1}, gid_strides={}, variable="b_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1093,9 +1172,8 @@ def test_floor_div_coefficient_collector(): n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses - f32_local = lp.get_mem_access_map( - knl, count_redundant_work=True, subgroup_size=SGS - ).filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) + m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) + f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups @@ -1133,7 +1211,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction="load", variable="b", variable_tag="mmbload", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={1: Variable("m")}, @@ -1141,7 +1220,8 @@ def test_mem_access_tagged_variables(): direction="load", variable="a", variable_tag="mmaload", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1154,7 +1234,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", variable_tag="mmresult", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1319,6 +1400,85 @@ def test_strided_footprint(): assert 2*num < denom +def test_stats_on_callable_kernel(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{:}", + """ + y[:] = matvec20x20(A[:,:], x[:]) + """, + [ + lp.GlobalArg("x,y", shape=(20,), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matvec") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + f64_add = 
op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 400
+
+
+def test_stats_on_callable_kernel_within_loop():
+    callee = lp.make_function(
+            "{[i, j]: 0<=i, j< 20}",
+            """
+            y[i] = sum(j, A[i,j]*x[j])
+            """, name="matvec20x20")
+
+    caller = lp.make_kernel(
+            "{[i]: 0<=i< 20}",
+            """
+            y[i, :] = matvec20x20(A[:,:], x[i, :])
+            """,
+            [
+                lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float),
+                lp.GlobalArg("A", shape=(20, 20), dtype=np.float),
+            ],
+            name="matmat")
+    caller = lp.merge([caller, callee])
+
+    op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True,
+            count_within_subscripts=True)
+
+    f64_add = op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 8000
+
+
+def test_callable_kernel_with_substitution():
+    callee = lp.make_function(
+            "{[i, j]: 0<=i, j< n}",
+            """
+            y[i] = sum(j, A[i,j]*x[j])
+            """,
+            [lp.ValueArg("n"), ...],
+            name="matvec")
+
+    caller = lp.make_kernel(
+            "{[i]: 0<=i< 20}",
+            """
+            y[i, :] = matvec(20, A[:,:], x[i, :])
+            """,
+            [
+                lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float),
+                lp.GlobalArg("A", shape=(20, 20), dtype=np.float),
+            ],
+            name="matmat")
+    caller = lp.merge([caller, callee])
+
+    op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True,
+            count_within_subscripts=True)
+
+    f64_add = op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 8000
+
+
 def test_no_loop_ops():
     # See https://github.com/inducer/loopy/issues/211
diff --git a/test/test_target.py b/test/test_target.py
index e6a93299143c399aebd2f5025adb90238aea0c5a..0fb386998e775b3603a099d997ba1ff78bf85af8 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -70,9 +70,7 @@ def test_ispc_target(occa_mode=False):
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"],
             default_tag="l.auto")
 
-    codegen_result = lp.generate_code_v2(
-            lp.get_one_scheduled_kernel(
-                lp.preprocess_kernel(knl)))
+    codegen_result = lp.generate_code_v2(knl)
 
     print(codegen_result.device_code())
     print(codegen_result.host_code())
@@ -96,9 +94,8 @@ def test_cuda_target():
             default_tag="l.auto")
 
     print(
-            lp.generate_code(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
+            lp.generate_code_v2(knl).device_code())
 
 
 def test_generate_c_snippet():
@@ -138,10 +135,7 @@ def test_generate_c_snippet():
     knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
 
     knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")
-
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    print(lp.generate_body(knl))
+    print(lp.generate_code_v2(knl))
 
 
 @pytest.mark.parametrize("target", [CTarget, OpenCLTarget])
@@ -354,8 +348,7 @@ def test_ispc_streaming_stores():
 
     knl = lp.set_argument_order(knl, vars + ["n"])
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-
     assert "streaming_store(" in lp.generate_code_v2(knl).all_code()
diff --git a/test/test_transform.py b/test/test_transform.py
index cad5d776a748fc8b42c9b1d9e950e91523d4c2cd..9ac29766bfb8f7f887455cfd1cb123af9ff4915c 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -148,7 +148,7 @@ def test_to_batched_temp(ctx_factory):
     bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
 
     # checking that cnst is not being batched
-    assert bknl.temporary_variables["cnst"].shape == ()
+    assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == ()
 
     a = np.random.randn(5, 5)
     x = np.random.randn(7, 5)
@@ -253,18 +253,17 @@ def test_vectorize(ctx_factory):
         a[i] = temp
         """)
     knl = lp.add_and_infer_dtypes(knl,
dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.set_array_axis_names(knl, "a,b", "i") knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, split_kwargs=dict(slabs=(0, 1))) - knl = lp.tag_data_axes(knl, "a,b", "c,vec") + knl = lp.tag_array_axes(knl, "a,b", "c,vec") ref_knl = knl ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) knl = lp.tag_inames(knl, {"i_inner": "vec"}) knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) code, inf = lp.generate_code(knl) lp.auto_test_vs_ref( @@ -273,19 +272,19 @@ def test_vectorize(ctx_factory): def test_extract_subst(ctx_factory): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -488,30 +490,36 @@ def test_add_nosync(): <>tmp5[i] = 0 {id=insn5,groups=g1} tmp5[i] = 1 {id=insn6,conflicts=g1} - """) + """, name="nosync") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog["nosync"].id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog["nosync"].id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -520,28 +528,30 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", [], name="lpy_knl") + new_root_kernel = prog["lpy_knl"].copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = {insn.id for insn in knl.instructions} + insn_ids = {insn.id for insn in prog["lpy_knl"].instructions} assert len(insn_ids) == 4 
assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} - """) + """, name="splitter") - knl = lp.split_iname(knl, "i", 4, within="id:to_split") + prog = lp.split_iname(prog, "i", 4, within="id:to_split") - for insn in knl.instructions: + for insn in prog["splitter"].instructions: if insn.id == "to_split": assert insn.within_inames == frozenset({"i_outer", "i_inner"}) if insn.id == "not_to_split": @@ -552,7 +562,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -562,10 +572,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + prg = lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in prg.callables_table.values()) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, prg) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): @@ -658,12 +670,12 @@ def test_add_inames_for_unused_hw_axes(ctx_factory): knl = lp.add_inames_for_unused_hw_axes(knl) - assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) + assert (knl["rank_one"].id_to_insn["init_alpha"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["a_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["b_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], @@ -723,12 +735,13 @@ def test_rename_argument_with_assumptions(): knl = lp.assume(knl, "n_old=10") knl = lp.rename_argument(knl, "n_old", "n_new") + assumptions = knl["loopy_kernel"].assumptions - assert "n_old" not in knl.assumptions.get_var_dict() - assert "n_new" in knl.assumptions.get_var_dict() + assert "n_old" not in assumptions.get_var_dict() + assert "n_new" in assumptions.get_var_dict() assert ( - (knl.assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) - == knl.assumptions) + (assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) + == assumptions) def test_tag_iname_with_match_pattern(): @@ -740,6 +753,7 @@ def test_tag_iname_with_match_pattern(): """) knl = lp.tag_inames(knl, "i*:unr") + knl = knl["loopy_kernel"] i0_tag, = knl.inames["i0"].tags i1_tag, = knl.inames["i1"].tags @@ -765,6 +779,7 @@ def test_custom_iname_tag(): """) knl = lp.tag_inames(knl, {"ifuzz0": ElementLoopTag(), "ifuzz1": DOFLoopTag()}) + knl = knl["loopy_kernel"] ifuzz0_tag, = knl.inames["ifuzz0"].tags ifuzz1_tag, = knl.inames["ifuzz1"].tags diff --git a/test/testlib.py b/test/testlib.py index 35d51f72d2d7cf08dc5b92c8377c9c1578509e6d..7009e8f5aa2caba96d83ba9bd5f8f700a75b7e4a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride: self.clean = clean self.vecsize = vecsize - 
def __call__(self, insn_ids, ignore_auto=True):
-        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
+        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids,
+                callables_table, ignore_auto)
         return gsize, (self.vecsize,)
 
 # }}}
@@ -131,4 +133,42 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 # }}}
 
+
+# {{{ test_register_function_lookup
+
+class Log2Callable(lp.ScalarCallable):
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (
+                    self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                    callables_table)
+
+        dtype = arg_id_to_dtype[0].numpy_dtype
+
+        if dtype.kind in ("u", "i"):
+            # integers (signed and unsigned) are cast to float32
+            dtype = np.float32
+
+        if dtype.type == np.float32:
+            name_in_target = "log2f"
+        elif dtype.type == np.float64:
+            name_in_target = "log2"
+        else:
+            raise TypeError(f"log2: unexpected type {dtype}")
+
+        from loopy.types import NumpyType
+        return (
+                self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={
+                        0: NumpyType(dtype), -1: NumpyType(dtype)}),
+                callables_table)
+
+# }}}
+
 # vim: foldmethod=marker
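+
+# A minimal usage sketch for Log2Callable (illustration only; it mirrors
+# test_register_function_lookup in test_callables.py):
+#
+#     prog = lp.make_kernel("{[i]: 0<=i<10}", "y[i] = log2(x[i])")
+#     prog = lp.register_callable(prog, "log2", Log2Callable("log2"))
+#
+# During type inference, with_types() above receives the dtype of x[i] and
+# binds name_in_target to "log2f" for float32 or "log2" for float64; that
+# name is what appears in the generated device code.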