diff --git a/doc/index.rst b/doc/index.rst index b77bbb16f413defe5010c75d28464051553b4486..9a10116d916468fd46b9b23ad113f3d9085ae699 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a59e84282119209cc89eb18e3a4eda97725edf0 --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,193 @@ +Calling Loopy Kernels and External Functions +============================================ + +Goals of a function interface +----------------------------- + +- *FIXME: * Needs to change after the new design of program. + +- Must be able to have complete information of the function just through the + epxression node. +- Must adhere to :mod:`loopy` semantics of immutability. +- Must have a class instance linked with the expression node which would record + the properties of the function. +- Must indicate in the expression if the function is known to the kernel. (This + is intended to be done by making the function expression node an instance of + ``ResolvedFunction`` as soon as the function definition is resolved by the + kernel) +- Function overloading is not encouraged in :mod:`loopy` as it gives rise to + contention while debugging with the help of the kernel intermediate + representation and hence if the expression nodes point to different function + instances they must differ in their representation. For example: ``float + sin(float )`` and ``double sin(double )`` should diverge by having different + identifiers as soon as data type of the argument is inferred. +- Must have an interface to register external functions. + + +Scoped Function and resolving +----------------------------- + +``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` +kernel, whose name has been resolved by the kernel. 
The process of matching a +function idenitifier with the function definition is called "resolving". + +A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it +is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a +:attr:`LoopKernel.scoped_functions` + +- Functions already registered by the target. Some examples include -- + ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) +- Functions that are defined in ``Loo.py`` and are realized into + different set of instructions during code generation. Some examples + include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... +- Functions registered as ``CallableKernels`` using + ``lp.register_callable_kernel(...)``. +- Functions that have been provided through + ``lp.register_function_id_to_in_knl_callable_mapper(...)`` +- Functions that can be made known from the user through + ``lp.register_function_mangler``. This is planned to be deprecated, + as its functionality is superseded by + ``lp.register_function_id_to_in_knl_callable_mapper(...)``. + +Expressions after a function is scoped +-------------------------------------- + +Consider the following expression. + +:: + + sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) + +During the kernel creation phase, the kernel would know that ``sin`` is +a function known to the target and hence it should be scoped. And as +expected, after ``make_kernel`` has been called the above expression +would get converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + callable_knl_func(c[i])*mangler_call(d[i]) + +This would also make an entry in the kernel's ``scoped_functions`` +dictionary as: + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None)} + +It might be noteworthy that at this step, it only scopes functions +through their names without any information about the types of the +function. 
+ +Once, the user calls the transformation: +``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, +the expression gets converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) + +This also makes an entry in the ``scoped_functions`` dictionary as -- + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)} + +Now, if the user calls +``register_function_mangler(knl, 'mangler_call')``, one might expect +that the mangler call function should get scoped, but that does **not** +happen, because the "old" ``function_manglers``, would return a match +only if all the parameters of the function match viz. name, argument +arity and argument types. Hence, the ``scoped_functions`` dictionary +would remain unchanged. + +``ResolvedFunctions`` and specializations +--------------------------------------- + +Consider the same ``ResolvedFunction('sin')`` as above. This function +although scoped does not the know the types i.e. it does yet know that +for a ``C-Target``, whether it should emit ``sin`` or ``sinf`` or +``sinl``. Hence, right now the function can be called as a +"type-generic" function as further in the pipeline it can take any one +of the above definitions. The functions go through a "specialization" +processes at various points in the pipeline, where the attributes of the +callables are resolved. + +- During type inference, the functions go though type specialization + where in the ``arg_id_to_dtype`` of the functions is realized. +- During descriptor inference, the functions goes through a description + specialization where the ``arg_id_to_descr`` is populated. 
The + ``arg_id_to_descr`` contains important information regarding shape, + strides and scope of the arguments which form an important part of + ``CallableKernel`` as this information would be helpful to to + generate the function signature and make changes to the data access + pattern of the variables in the callee kernel. +- Whenever a ``ResolvedFunction`` goes through a specialization, this is + indicated by changing the name in the ``pymbolic`` node. + +If during type inference, it is inferred that the type of ``a[i]`` is +``np.float32``. The new ``pymbolic`` node would be: + +:: + + ResolvedFunction('sin_0')(a[i]) + ... + +This name change is done so that it indicates that the node points to a +different ``ScalarCallable`` in the dictionary. And hence a new entry is +added to the ``scoped_functions`` dictionary as: + +:: + + {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None), + 'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32, + -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')} + +Description Inference +--------------------- + +Although this step has no significance for a ``ScalarCallable``, it +forms a very important part of ``CallableKernel``. In which the +``dim_tags``, ``shape`` and ``address_space`` of the arguments of the +callable kernel is altered. + +- The ``dim_tags`` attribute helps to ensure that the memory layout + between the caller and the callee kernel is coherent. +- The ``address_space`` attribute ensures that, while writing the device + code we emit the appropriate scope qualifiers for the function + declaration arguments. +- The ``shape`` attribute helps in: + + - Storage allocation. + - Memory layout. + - Out of bounds accesses to be caught in ``Loo.py``. 
+ +Hence, in the ``Loo.py`` pipeline, one might expect the following +developments of the ``sin`` pymbolic call expression node. + +:: + + sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> + (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) + +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- + +The earlier "function\_mangler" as a member method of the class +``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The +function scopers would return a list of functions with the signature +``(target, identifier)->lp.InKernelCallable``. + +An example: Calling BLAS +------------------------ + +.. literalinclude:: ../examples/python/external-call.py + diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1b017f701f8161e93c4fdc1c14644dfe4b4fa74c..55d98ac0ede2f808e1a8984de6e58a2421c7edd1 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -334,7 +334,7 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl.root_kernel.stringify(with_dependencies=True)) --------------------------------------------------------------------------- KERNEL: loopy_kernel --------------------------------------------------------------------------- @@ -1178,7 +1178,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... 
loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1206,8 +1206,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_linearized_kernel`: - >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_linearized_kernel(prog.root_kernel, prog.callables_table) + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1236,10 +1238,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_linearized_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1278,7 +1280,7 @@ does in more detail: The kernel translates into two OpenCL kernels. 
- >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -1578,12 +1580,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1640,15 +1642,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... 
Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1683,13 +1685,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name) ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1707,13 +1709,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1750,12 +1752,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... 
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1765,13 +1767,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... 
].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1791,12 +1793,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1805,13 +1807,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... 
].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1845,14 +1847,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. 
doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1905,8 +1907,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/examples/python/call-external.py b/examples/python/call-external.py new file mode 100644 index 0000000000000000000000000000000000000000..c13d99bd06295096c26d6e113841c853f80645fc --- /dev/null +++ b/examples/python/call-external.py @@ -0,0 +1,111 @@ +import loopy as lp +import numpy as np +from loopy.diagnostic import LoopyError +from loopy.target.c import CTarget + + +# {{{ blas callable + +class BLASCallable(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, callables_table): + for i in range(0, 2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + mat_dtype = arg_id_to_dtype[0].numpy_dtype + vec_dtype = arg_id_to_dtype[1].numpy_dtype + + if mat_dtype != vec_dtype: + raise LoopyError("DGEMV should have same dtype for matrix and " + "vector") + + if vec_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV only supported for float32 and float64 " + "types") + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), + -1: NumpyType(vec_dtype)}), 
callables_table + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + + parameters.append(insn.assignees[0]) + par_dtypes.append(self.arg_id_to_dtype[-1]) + + # no type casting in array calls. + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + mat_descr = self.arg_id_to_descr[0] + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + c_parameters.insert(0, var('CblasRowMajor')) + c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(2, mat_descr.shape[0]) + c_parameters.insert(3, mat_descr.shape[1]) + c_parameters.insert(4, 1) + c_parameters.insert(6, 1) + c_parameters.insert(8, 1) + c_parameters.insert(10, 1) + return var(self.name_in_target)(*c_parameters), False + + def generate_preambles(self, target): + assert isinstance(target, CTarget) + yield("99_cblas", "#include ") + return + + +def blas_fn_lookup(target, identifier): + if identifier == 'gemv': + return BLASCallable(name='gemv') + return None + +# }}} + + +n = 10 + +knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:] = gemv(A[:, :], x[:]) + """, [ + lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), + lp.GlobalArg('x', dtype=np.float64, shape=(n, )), + lp.GlobalArg('y', shape=(n, )), ...], + target=CTarget(), + lang_version=(2018, 2)) + +knl = lp.register_function_id_to_in_knl_callable_mapper( + 
knl, blas_fn_lookup) + +print(lp.generate_code_v2(knl).device_code()) diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1906f703b0efc39808ff68a63b91ff37..be22e268c85fe985a98763426faf8cfadf73c5fb 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -1,7 +1,5 @@ import numpy as np import loopy as lp -import pyopencl as cl -import pyopencl.array knl = lp.make_kernel( "{ [i,k]: 0<=i max_test_kernel_count: - break - kernel = infer_unknown_types(kernel, expect_completion=True) - - compiled = CompiledKernel(ctx, kernel) + test_prog = infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog, + test_prog_codegen_result.implemented_data_info, + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel:") + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - kernel_info = compiled.kernel_info(frozenset()) + logger.info("%s: run warmup" % (test_prog.name)) - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - args["out_host"] = False + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - # {{{ find cl program + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + 
shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - for name in dir(kernel_info.cl_kernels): - if name.startswith("__"): - continue - cl_kernel = getattr(kernel_info.cl_kernels, name) - cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) - break - else: - assert False, "could not find cl_program" + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - # }}} + need_check = False - print(type(cl_program)) - if hasattr(cl_program, "binaries"): - print(cl_program.binaries[0]) + events = [] + queue.finish() - print(75*"-") + logger.info("%s: warmup done" % (test_prog.name)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_prog.name)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = max(warmup_rounds, 1) - logger.info("%s: warmup done" % (knl.name)) + while True: + from time 
import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = max(warmup_rounds, 1) + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_prog.name)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + 
print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - - print("elapsed: %s s event, %s s marker-event %s s wall " - "(%d rounds)%s" % ( - format_float_or_none(elapsed_event), - format_float_or_none(elapsed_event_marker), - format_float_or_none(elapsed_wall), timing_rounds, rates)) - - if do_check: - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) - if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index 0d2bbff7cf8d6f9e63a33dc2c8814f29afae70f0..b0cf68f08224c44f8e2186a80fd31e99b7f67ebf 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,12 +27,14 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel from loopy.type_inference import TypeInferenceMapper from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -59,6 +61,68 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class 
UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ResolvedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_resolved(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unsupported instruction type %s." 
% type(insn).__name__) + # }}} @@ -79,9 +143,9 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -143,6 +207,19 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + from itertools import chain + iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in + kernel.all_inames()))) + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -157,8 +234,10 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, callables_table): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -171,6 +250,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = callables_table[ + insn.expression.function.name] + if 
isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: @@ -179,9 +273,11 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= kernel.insn_inames(insn): raise LoopyError( "instruction '%s' references " - "inames '%s' that the instruction does not depend on" + "inames '%s' that the instruction does not depend on in " + "the kernel '%s'" % (insn.id, - ", ".join(expression_inames - kernel.insn_inames(insn)))) + ", ".join(expression_inames - + kernel.insn_inames(insn)), kernel.name)) def _is_racing_iname_tag(tv, tag): @@ -417,11 +513,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. 
" @@ -632,7 +729,15 @@ def check_variable_access_ordered(kernel): return if kernel.options.enforce_variable_access_ordered: - _check_variable_access_ordered_inner(kernel) + try: + _check_variable_access_ordered_inner(kernel) + except RuntimeError as e: + if isinstance(e.args[0], str) and ( + e.args[0].startswith('maximum recursion depth exceeded')): + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + else: + raise e else: from loopy.diagnostic import VariableAccessNotOrdered try: @@ -640,20 +745,27 @@ def check_variable_access_ordered(kernel): except VariableAccessNotOrdered as e: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) + except RuntimeError as e: + if isinstance(e.args[0], str) and ( + e.args[0].startswith('maximum recursion depth exceeded')): + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + else: + raise e # }}} # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -681,7 +793,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ 
-696,7 +809,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + callables_table) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -713,7 +827,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -764,9 +879,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table) # }}} @@ -916,15 +1032,19 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + # FIXME `check_for_unused_hw_axes_in_insns` currently flags a problem + # in the callee if a caller kernel, at a call site, uses hardware axes + # (say `g.0` and `g.1`). It does not seem that that knowledge is + # propagated to the callee. 
+ # check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index a92922b1845d76dd7a700a93c05de3eecf8c28dd..3dbdeb41e37aebc0e3c2b0b8b3fc68866dfec080 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -65,11 +65,9 @@ def main(): parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), default="opencl") - parser.add_argument("--name") parser.add_argument("--transform") parser.add_argument("--edit-code", action="store_true") parser.add_argument("--occa-defines") - parser.add_argument("--occa-add-dummy-arg", action="store_true") parser.add_argument("--print-ir", action="store_true") args = parser.parse_args() @@ -108,9 +106,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", ".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile, "r") as infile_fd: infile_content = infile_fd.read() @@ -161,10 +161,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - if args.name is not None: - kernel = kernel.copy(name=args.name) - - kernels = [kernel] + prg = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -181,69 +178,31 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - kernels = lp.parse_transformed_fortran( + prg = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) - if args.name is not None: - kernels = [kernel for kernel in kernels - if kernel.name == args.name] 
- - if not kernels: - raise RuntimeError("no kernels found (name specified: %s)" - % args.name) - else: raise RuntimeError("unknown language: '%s'" % args.lang) + if not isinstance(prg, lp.Program): + # FIXME + assert isinstance(prg, list) # of kernels + raise NotImplementedError("convert list of kernels to Program") + if args.print_ir: - for kernel in kernels: - print(kernel, file=sys.stderr) - - if args.occa_add_dummy_arg: - new_kernels = [] - for kernel in kernels: - new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) - ] + kernel.args - new_kernels.append(kernel.copy(args=new_args)) - - kernels = new_kernels - del new_kernels - - codes = [] - from loopy.codegen import generate_code - for kernel in kernels: - kernel = lp.preprocess_kernel(kernel) - code, impl_arg_info = generate_code(kernel) - codes.append(code) + print(prg, file=sys.stderr) + + prg = lp.preprocess_kernel(prg) + cgr = lp.generate_code_v2(prg) if args.outfile is not None: outfile = args.outfile else: outfile = "-" - code = "\n\n".join(codes) - - # {{{ edit code if requested - - import os - edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") - need_edit = args.edit_code - if not need_edit and edit_kernel_env is not None: - # Do not replace with "any()"--Py2.6/2.7 bug doesn't like - # comprehensions in functions with exec(). - - for k in kernels: - if edit_kernel_env.lower() in k.name.lower(): - need_edit = True - - if need_edit: - from pytools import invoke_editor - code = invoke_editor(code, filename="edit.cl") - - # }}} + code = cgr.device_code() if outfile == "-": sys.stdout.write(code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..70cd7cc956acdfdc59402851b081602ca78ce187 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -22,6 +22,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import logging +logger = logging.getLogger(__name__) + import six from loopy.diagnostic import LoopyError, warn @@ -32,8 +35,14 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -import logging -logger = logging.getLogger(__name__) + +from loopy.symbolic import CombineMapper +from functools import reduce + +from loopy.kernel.function_interface import CallableKernel +from cgen import Collection + +from pytools import ProcessLogger # {{{ implemented data info @@ -146,6 +155,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -187,17 +197,23 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: callables_table + + An instance of :class:`loopy.CallablesTable`. """ - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + callables_table, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -206,6 +222,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.callables_table = callables_table self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -214,7 +231,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, 
kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -224,6 +241,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -244,6 +264,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -253,6 +274,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + callables_table=self.callables_table, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -362,6 +384,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. 
attribute:: kernel @@ -374,19 +422,19 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, callables_table, target): """ :returns: a :class:`CodeGenerationResult` + + :param kernel: An instance of :class:`loopy.LoopKernel`. + :param callables_table: An instance of + :class:`loopy.CallablesTable`. """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " @@ -407,13 +455,10 @@ def generate_code_v2(kernel): # }}} - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, callables_table) - logger.info("%s: generate code: start" % kernel.name) + codegen_plog = ProcessLogger(logger, "%s: generate code" % kernel.name) # {{{ examine arg list @@ -427,13 +472,13 @@ def generate_code_v2(kernel): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -444,7 +489,8 @@ def generate_code_v2(kernel): allow_complex = False for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.involves_complex(): + dtype = var.dtype + if dtype.involves_complex(): allow_complex = True # }}} @@ -456,6 +502,7 @@ def 
generate_code_v2(kernel): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -467,12 +514,14 @@ def generate_code_v2(kernel): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + callables_table=callables_table) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -502,7 +551,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -516,7 +565,7 @@ def generate_code_v2(kernel): implemented_domains=LazilyUnpicklingDict( codegen_result.implemented_domains)) - logger.info("%s: generate code: done" % kernel.name) + codegen_plog.done() if CACHING_ENABLED: code_gen_cache.store_if_not_present(input_kernel, codegen_result) @@ -524,6 +573,79 @@ def generate_code_v2(kernel): return codegen_result +def generate_code_v2(program): + """ + Returns an instance of :class:`CodeGenerationResult`. + + :param program: An instance of :class:`loopy.Program`. 
+ """ + from loopy.kernel import LoopKernel + from loopy.program import make_program + from cgen import FunctionBody + + if isinstance(program, LoopKernel): + program = make_program(program) + + from loopy.kernel import KernelState + if program.root_kernel.state == KernelState.INITIAL: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + codegen_results = {} + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + codegen_results[func_id] = ( + generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.callables_table, program.target)) + if not in_knl_callable.subkernel.is_called_from_host: + assert codegen_results[func_id].host_program is None + + device_preambles = [] + for cgr in codegen_results.values(): + device_preambles.extend(cgr.device_preambles) + + # collecting the function declarations of callee kernels + for in_knl_callable in program.callables_table.values(): + for preamble in in_knl_callable.generate_preambles(program.target): + device_preambles.append(preamble) + + collective_device_program = codegen_results[program.name].device_programs[0] + callee_fdecls = [] + + for func_id, callee_cgr in codegen_results.items(): + if func_id != program.name: + assert len(callee_cgr.device_programs) == 1 + callee_prog_ast = callee_cgr.device_programs[0].ast + collective_device_program = collective_device_program.copy( + ast=Collection([callee_prog_ast, collective_device_program.ast])) + if isinstance(callee_prog_ast, Collection): + # if there is a read only constant in the kernel + for entry in callee_prog_ast.contents: + if isinstance(entry, FunctionBody): + callee_fdecls.append(entry.fdecl) + elif isinstance(callee_prog_ast, FunctionBody): + callee_fdecls.append(callee_prog_ast.fdecl) + else: + raise NotImplementedError("Do 
not know how to add forward" + " declarations for %r." % type(callee_prog_ast)) + + # collecting the function declarations of callee kernels + for callee_fdecl in callee_fdecls: + collective_device_program = collective_device_program.copy( + ast=Collection([callee_fdecl, collective_device_program.ast])) + + collective_device_programs = [collective_device_program] + ( + codegen_results[program.name].device_programs[1:]) + + return codegen_results[program.name].copy( + device_programs=collective_device_programs, + device_preambles=device_preambles) + + def generate_code(kernel, device=None): if device is not None: from warnings import warn diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e9de52eb68bd47aec09b0a19de0a5d5433aa9843..3bad73462598c61895f2d274c13941e433986cb4 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -90,17 +90,21 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) + if kernel.is_called_from_host: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for callee kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index b3a87798840bb1624d350c79830f29142e54ab6c..5796f5133a1d82890c55accf28072dd5db582ee4 100644 --- a/loopy/codegen/loop.py +++ 
b/loopy/codegen/loop.py @@ -249,7 +249,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71c1b16deeaac98f8408d5ca82f2de1714..7950c56b3b62693f974cbcc5ab8686f30fa42cbe 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - getattr(self, "device_preambles", []) + list(getattr(self, "device_preambles", [])) ) return ( @@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code) or ( + codegen_state.kernel.is_called_from_host): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = 
ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_program=None) return codegen_result diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 05b0a92050a51be1cd980648325921fbf13768d8..3516ca29a880af18936c558c2a6a457af2e3236c 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -22,7 +22,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + from loopy.diagnostic import LoopyError +from pytools import ProcessLogger def c_preprocess(source, defines=None, filename=None, include_paths=None): @@ -154,8 +158,9 @@ def parse_transformed_fortran(source, free_form=True, strict=True, :func:`parse_fortran`. * ``FILENAME``: the file name of the code being processed - The transform code must define ``RESULT``, conventionally a list of - kernels, which is returned from this function unmodified. + The transform code must define ``RESULT``, conventionally a list of kernels + or a :class:`loopy.Program`, which is returned from this function + unmodified. 
An example of *source* may look as follows:: @@ -236,12 +241,14 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] -def parse_fortran(source, filename="", free_form=True, strict=True, +def parse_fortran(source, filename="", free_form=None, strict=None, seq_dependencies=None, auto_dependencies=None, target=None): """ - :returns: a list of :class:`loopy.LoopKernel` objects + :returns: a :class:`loopy.Program` """ + parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) + if seq_dependencies is not None and auto_dependencies is not None: raise TypeError( "may not specify both seq_dependencies and auto_dependencies") @@ -253,6 +260,10 @@ def parse_fortran(source, filename="", free_form=True, strict=True, if seq_dependencies is None: seq_dependencies = True + if free_form is None: + free_form = True + if strict is None: + strict = True import logging console = logging.StreamHandler() @@ -273,7 +284,26 @@ def parse_fortran(source, filename="", free_form=True, strict=True, f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) - return f2loopy.make_kernels(seq_dependencies=seq_dependencies) + kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + + from loopy.kernel.tools import identify_root_kernel + from loopy.program import make_program + from loopy.transform.callable import register_callable_kernel + + root_knl_name = identify_root_kernel(kernels) + root_knl = [knl for knl in kernels if knl.name == + root_knl_name][0].copy(is_called_from_host=True) + callee_kernels = [knl for knl in kernels if knl.name != root_knl_name] + prog = make_program(root_knl) + for callee_knl in callee_kernels: + #FIXME: This would need some sort of traversal to be valid + # for all cases + # THIS IS A VERY IMPORTANT FIXME!! 
+ prog = register_callable_kernel(prog, callee_knl) + + parse_plog.done() + + return prog # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index ea724278f086befe7d93cdd076aca9d5955063b2..1400fb3b71416355229f11a1e6bbd74e62b4897f 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -44,6 +44,25 @@ _and = intern("and") _or = intern("or") +def tuple_to_complex_literal(expr): + if len(expr) != 2: + raise TranslationError("complex literals must have " + "two entries") + + r, i = expr + + r = np.array(r)[()] + i = np.array(i)[()] + + dtype = (r.dtype.type(0) + i.dtype.type(0)) + if dtype == np.float32: + dtype = np.complex64 + else: + dtype = np.complex128 + + return dtype(float(r) + float(i)*1j) + + # {{{ expression parser class FortranExpressionParser(ExpressionParserBase): @@ -178,24 +197,31 @@ class FortranExpressionParser(ExpressionParserBase): left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, left_exp) - if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: - # this must be a complex literal - if len(left_exp) != 2: - raise TranslationError("complex literals must have " - "two entries") + return left_exp, did_something - r, i = left_exp + def parse_expression(self, pstate, min_precedence=0): + left_exp = self.parse_prefix(pstate) - dtype = (r.dtype.type(0) + i.dtype.type(0)) - if dtype == np.float32: - dtype = np.complex64 - else: - dtype = np.complex128 + did_something = True + while did_something: + did_something = False + if pstate.is_at_end(): + return left_exp - left_exp = dtype(float(r) + float(i)*1j) + result = self.parse_postfix( + pstate, min_precedence, left_exp) + left_exp, did_something = result - return left_exp, did_something + from pymbolic.parser import FinalizedTuple + if isinstance(left_exp, FinalizedTuple): + # View all tuples that survive parsing as complex literals + # 
"FinalizedTuple" indicates that this tuple was enclosed + # in parens. + return tuple_to_complex_literal(left_exp) + + return left_exp # }}} + # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 91a5fdc88f02a99c6064f6b9944b08de662a27a8..260d1b8ecf9cdb332aa651795fb9526da681e700 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -37,12 +37,14 @@ import islpy as isl from islpy import dim_type from loopy.symbolic import IdentityMapper from loopy.diagnostic import LoopyError -from pymbolic.primitives import Wildcard +from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter -class SubscriptIndexBaseShifter(IdentityMapper): +class SubscriptIndexAdjuster(IdentityMapper): + """Adjust base indices of subscripts and lengths of slices.""" + def __init__(self, scope): self.scope = scope @@ -60,21 +62,63 @@ class SubscriptIndexBaseShifter(IdentityMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - subscript = list(subscript) - if len(dims) != len(subscript): raise TranslationError("inconsistent number of indices " "to '%s'" % name) + new_subscript = [] for i in range(len(dims)): if len(dims[i]) == 2: - # has a base index - subscript[i] -= dims[i][0] + # has an explicit base index + base_index, end_index = dims[i] elif len(dims[i]) == 1: - # base index is 1 implicitly - subscript[i] -= 1 + base_index = 1 + end_index, = dims[i] + + sub_i = subscript[i] + if isinstance(sub_i, Slice): + start = sub_i.start + if start is None: + start = base_index + + step = sub_i.step + if step is None: + step = 1 + + stop = sub_i.stop + if stop is None: + stop = end_index + + if step == 1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + elif step == -1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index - 
1, + + step + )) + + else: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") + + else: + sub_i = sub_i - base_index - return expr.aggregate[self.rec(tuple(subscript))] + new_subscript.append(sub_i) + + return expr.aggregate[self.rec(tuple(new_subscript))] # }}} @@ -85,9 +129,6 @@ class Scope(object): def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name - # map name to data - self.data_statements = {} - # map first letter to type self.implicit_types = {} @@ -98,7 +139,7 @@ class Scope(object): self.type_map = {} # map name to data - self.data = {} + self.data_map = {} self.arg_names = arg_names @@ -187,7 +228,7 @@ class Scope(object): expr = submap(expr) - subshift = SubscriptIndexBaseShifter(self) + subshift = SubscriptIndexAdjuster(self) expr = subshift(expr) return expr @@ -218,11 +259,16 @@ class F2LoopyTranslator(FTreeWalkerBase): self.block_nest = [] + def add_instruction(self, insn): + scope = self.scope_stack[-1] + + scope.previous_instruction_id = insn.id + scope.instructions.append(insn) + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = intern("insn%d" % self.insn_id_counter) - self.insn_id_counter += 1 + new_id = self.get_insn_id() from loopy.kernel.data import Assignment insn = Assignment( @@ -233,8 +279,13 @@ class F2LoopyTranslator(FTreeWalkerBase): predicates=frozenset(self.conditions), tags=tuple(self.instruction_tags)) - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_instruction(insn) + + def get_insn_id(self): + new_id = intern("insn%d" % self.insn_id_counter) + self.insn_id_counter += 1 + + return new_id # {{{ map_XXX functions @@ -328,7 +379,8 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node, node.entity_decls): + for name, shape, initializer in self.parse_dimension_specs( + node, node.entity_decls): if shape 
is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -337,6 +389,9 @@ class F2LoopyTranslator(FTreeWalkerBase): assert name not in scope.type_map scope.type_map[name] = tp + assert name not in scope.data_map + scope.data_map[name] = initializer + return [] map_Logical = map_type_decl # noqa: N815 @@ -348,7 +403,10 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node, node.items): + for name, shape, initializer in self.parse_dimension_specs(node, node.items): + if initializer is not None: + raise LoopyError("initializer in dimension statement") + if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -437,7 +495,23 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): - raise NotImplementedError("call") + scope = self.scope_stack[-1] + + new_id = self.get_insn_id() + + from pymbolic import var + + from loopy.kernel.data import CallInstruction + insn = CallInstruction( + (), var(node.designator)(*(scope.process_expression_for_loopy( + self.parse_expr(node, item)) for item in node.items)), + within_inames=frozenset( + scope.active_loopy_inames), + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + self.add_instruction(insn) def map_Return(self, node): raise NotImplementedError("return") @@ -445,11 +519,6 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_ArithmeticIf(self, node): raise NotImplementedError("arithmetic-if") - def map_If(self, node): - raise NotImplementedError("if") - # node.expr - # node.content[0] - def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] @@ -476,6 +545,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_expr) + def map_If(self, node): + self.realize_conditional(node, None) + + for c in node.content: + self.rec(c) + + 
self.conditions_data.pop() + self.conditions.pop() + def map_IfThen(self, node): self.block_nest.append("if") self.realize_conditional(node, None) @@ -674,6 +752,10 @@ class F2LoopyTranslator(FTreeWalkerBase): for arg_name in sub.arg_names: dims = sub.dim_map.get(arg_name) + if sub.data_map.get(arg_name) is not None: + raise NotImplementedError( + "initializer for argument %s" % arg_name) + if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( @@ -681,6 +763,7 @@ class F2LoopyTranslator(FTreeWalkerBase): arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), + is_output_only=False, )) else: kernel_data.append( @@ -699,15 +782,22 @@ class F2LoopyTranslator(FTreeWalkerBase): if sub.implicit_types is None and dtype is None: continue + kwargs = {} + if sub.data_map.get(var_name) is not None: + kwargs["read_only"] = True + kwargs["address_space"] = lp.AddressSpace.PRIVATE + kwargs["initializer"] = np.array( + sub.data_map[var_name], dtype=dtype) + kernel_data.append( lp.TemporaryVariable( var_name, dtype=dtype, - shape=sub.get_loopy_shape(var_name))) + shape=sub.get_loopy_shape(var_name), + **kwargs)) # }}} - from loopy.version import MOST_RECENT_LANGUAGE_VERSION - knl = lp.make_kernel( + knl = lp.make_function( sub.index_sets, sub.instructions, kernel_data, @@ -716,11 +806,10 @@ class F2LoopyTranslator(FTreeWalkerBase): index_dtype=self.index_dtype, target=self.target, seq_dependencies=seq_dependencies, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) - from loopy.loop import fuse_loop_domains - knl = fuse_loop_domains(knl) + from loopy.loop import merge_loop_domains + knl = merge_loop_domains(knl) knl = lp.fold_constants(knl) result.append(knl) diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index c7389677489f1cf1c77c2a47c395badd35de230e..a124757f4729d270b0ab47c7e07cf1c436733045 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -54,7 +54,9 @@ 
class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?" + r"(\s*=\s*(?P.+))?" + "$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): @@ -77,7 +79,31 @@ class FTreeWalkerBase(object): else: shape = None - yield name, shape + init_str = groups["initializer"] + if init_str: + init_str = init_str.replace("(/", "[") + init_str = init_str.replace("/)", "]") + init_expr = self.parse_expr(node, init_str) + + from numbers import Number + if isinstance(init_expr, Number): + initializer = init_expr + elif isinstance(init_expr, list): + for i, item in enumerate(init_expr): + if not isinstance(item, Number): + raise LoopyError("unexpected type of " + "item %d in initializer: %s" + % (i+1, type(init_expr).__name__)) + initializer = init_expr + + else: + raise LoopyError("unexpected type of initializer: %s" + % type(init_expr).__name__) + + else: + initializer = None + + yield name, shape, initializer def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 1b2a83aad970e4f7af9027bb43a1e71ab20560ad..4d57de26b6cfa3d8932ba4f85ed02b97ddcda975 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -25,9 +25,14 @@ THE SOFTWARE. """ +import six +import numpy as np from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError +from pymbolic.mapper.evaluator import \ + EvaluationMapper as EvaluationMapperBase + +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type @@ -63,7 +68,30 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop): +def make_slab(space, iname, start, stop, step=1): + """ + Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the + constraint ``start <= step*iname < stop``. 
+ + :arg space: An instance of :class:`islpy._isl.Space`. + + :arg iname: + Either an instance of :class:`str` as a name of the ``iname`` or a + tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. + + :arg start: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the lower bound of + ``step*iname``(inclusive). + + :arg stop: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the upper bound of + ``step*iname``. + + :arg step: + An instance of :class:`int`. + """ zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -92,13 +120,25 @@ def make_slab(space, iname, start, stop): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - result = (isl.BasicSet.universe(space) - # start <= iname - .add_constraint(isl.Constraint.inequality_from_aff( - iname_aff - start)) - # iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - iname_aff))) + if step > 0: + result = (isl.BasicSet.universe(space) + # start <= step*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff - start)) + # step*iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + stop-1 - step*iname_aff))) + elif step < 0: + result = (isl.BasicSet.universe(space) + # start >= (-step)*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff + start)) + # (-step)*iname > stop + .add_constraint(isl.Constraint.inequality_from_aff( + -stop-1 - step*iname_aff))) + else: + # step = 0 + raise LoopyError("0 step not allowed in make_slab.") return result @@ -700,4 +740,133 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): # }}} + +# {{{ subst_into_pwqpolynomial + +class QPolynomialEvaluationMapper(EvaluationMapperBase): + def __init__(self, space): + self.zero = isl.QPolynomial.zero_on_domain(space) + + context = {} + for name, (dt, pos) in six.iteritems(space.get_var_dict()): + if dt == dim_type.set: + dt = 
dim_type.in_
+
+            context[name] = isl.QPolynomial.var_on_domain(space, dt, pos)
+
+        super(QPolynomialEvaluationMapper, self).__init__(context)
+
+    def map_constant(self, expr):
+        if isinstance(expr, np.integer):
+            expr = int(expr)
+
+        return self.zero + expr
+
+    def map_quotient(self, expr):
+        raise TypeError("true division in '%s' not supported "
+                "for as-pwaff evaluation" % expr)
+
+
+def get_param_subst_domain(new_space, base_obj, subst_dict):
+    """Modify the :mod:`islpy` object *base_obj* to incorporate parameters for
+    the keys of *subst_dict*, and rename existing parameters to include a
+    trailing prime.
+
+    :arg new_space: A :class:`islpy.Space` that contains the keys of
+        *subst_dict*
+    :arg subst_dict: A dictionary mapping parameters occurring in *base_obj*
+        to their values in terms of variables in *new_space*
+    :returns: a tuple ``(base_obj, subst_domain, subst_dict)``, where
+        *base_obj* is the passed *base_obj* with the space extended to cover
+        the new parameters in *new_space*, *subst_domain* is an
+        :class:`islpy.BasicSet` incorporating the constraints from *subst_dict*
+        and existing in the same space as *base_obj*, and *subst_dict*
+        is a copy of the passed *subst_dict* modified to incorporate primed
+        variable names in the keys.
+ """ + + # {{{ rename subst_dict keys and base_obj parameters to include trailing prime + + i_begin_subst_space = base_obj.dim(dim_type.param) + + new_subst_dict = {} + for i in range(i_begin_subst_space): + old_name = base_obj.space.get_dim_name(dim_type.param, i) + new_name = old_name + "'" + new_subst_dict[new_name] = subst_dict[old_name] + base_obj = base_obj.set_dim_name(dim_type.param, i, new_name) + + subst_dict = new_subst_dict + del new_subst_dict + + # }}} + + # {{{ add dimensions to base_obj + + base_obj = base_obj.add_dims(dim_type.param, new_space.dim(dim_type.param)) + for i in range(new_space.dim(dim_type.param)): + base_obj = base_obj.set_dim_name(dim_type.param, i+i_begin_subst_space, + new_space.get_dim_name(dim_type.param, i)) + + # }}} + + # {{{ build subst_domain + + subst_domain = isl.BasicSet.universe(base_obj.space).params() + + from loopy.symbolic import guarded_aff_from_expr + for i in range(i_begin_subst_space): + name = base_obj.space.get_dim_name(dim_type.param, i) + aff = guarded_aff_from_expr(subst_domain.space, subst_dict[name]) + aff = aff.set_coefficient_val(dim_type.param, i, -1) + subst_domain = subst_domain.add_constraint( + isl.Constraint.equality_from_aff(aff)) + + # }}} + + return base_obj, subst_domain, subst_dict + + +def subst_into_pwqpolynomial(new_space, poly, subst_dict): + if not poly.get_pieces(): + assert new_space.is_params() + result = isl.PwQPolynomial.zero(new_space.insert_dims(dim_type.out, 0, 1)) + assert result.dim(dim_type.out) == 1 + return result + + i_begin_subst_space = poly.dim(dim_type.param) + + poly, subst_domain, subst_dict = get_param_subst_domain( + new_space, poly, subst_dict) + + from loopy.symbolic import qpolynomial_to_expr + new_pieces = [] + for valid_set, qpoly in poly.get_pieces(): + valid_set = valid_set & subst_domain + if valid_set.plain_is_empty(): + continue + + valid_set = valid_set.project_out(dim_type.param, 0, i_begin_subst_space) + from pymbolic.mapper.substitutor import ( + 
SubstitutionMapper, make_subst_func) + sub_mapper = SubstitutionMapper(make_subst_func(subst_dict)) + expr = sub_mapper(qpolynomial_to_expr(qpoly)) + qpoly = QPolynomialEvaluationMapper(valid_set.space)(expr) + + new_pieces.append((valid_set, qpoly)) + + if not new_pieces: + raise ValueError("no pieces of PwQPolynomial survived the substitution") + + valid_set, qpoly = new_pieces[0] + result = isl.PwQPolynomial.alloc(valid_set, qpoly) + for valid_set, qpoly in new_pieces[1:]: + result = result.add_disjoint( + isl.PwQPolynomial.alloc(valid_set, qpoly)) + + assert result.dim(dim_type.out) + return result + +# }}} + # vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index c3cd1738d7160c7feb6ef5d1042e3d41e19cdfdb..0ce06a126ef435f99b32c89bcd576beba648a3bb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,11 +37,8 @@ import re from pytools import UniqueNameGenerator, generate_unique_names, natsorted -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -161,8 +158,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances - representing the :ref:`domain-tree`. + a list of :class:`islpy.BasicSet` instances representing the + :ref:`domain-tree`. May be empty. .. attribute:: instructions @@ -238,6 +235,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: target A subclass of :class:`loopy.TargetBase`. + + .. attribute:: is_called_from_host + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from other top level kernels. Default value is + *True*. 
+ """ # {{{ constructor @@ -262,11 +265,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, target=None, + is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): @@ -293,15 +297,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -313,12 +309,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() + if index_dtype is None: + index_dtype = np.int32 # }}} # {{{ process assumptions - if assumptions is None: + if assumptions is None and domains: dom0_space = domains[0].get_space() assumptions_space = isl.Space.params_alloc( dom0_space.get_ctx(), dom0_space.dim(dim_type.param)) @@ -328,6 +326,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): dom0_space.get_dim_name(dim_type.param, i)) assumptions = isl.BasicSet.universe(assumptions_space) + elif assumptions is None and not domains: + assumptions = isl.BasicSet.read_from_str( + isl.DEFAULT_CONTEXT, "[] -> { : 1 = 1}") + elif isinstance(assumptions, str): assumptions_set_str = "[%s] -> { : %s}" \ % (",".join(s for s in self.outer_params(domains)), @@ -405,6 +407,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -413,7 
+416,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -531,6 +534,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): except KeyError: pass + if name in self.all_inames(): + from loopy import TemporaryVariable + return TemporaryVariable( + name=name, + dtype=self.index_dtype, + shape=()) + + try: + dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), + name) + from loopy import ValueArg + return ValueArg(name, dtype) + except TypeError: + pass + raise ValueError("nothing known about variable '%s'" % name) @property @@ -644,7 +662,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): for dom in self.domains: return dom.get_ctx() - assert False + return isl.DEFAULT_CONTEXT @memoize_method def combine_domains(self, domains): @@ -1016,7 +1034,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return ( set( arg.name for arg in self.args - if isinstance(arg, ArrayArg)) + if isinstance(arg, ArrayArg) + and arg.address_space == AddressSpace.GLOBAL) | set( tv.name for tv in six.itervalues(self.temporary_variables) @@ -1073,20 +1092,26 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. 
""" - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) + # {{{ collecting the callee kernels in insn_ids + + from loopy.kernel.tools import get_direct_callee_kernels + callee_kernels = get_direct_callee_kernels(self, + callables_table, insn_ids) + + # }}} all_inames_by_insns = set() for insn_id in insn_ids: @@ -1101,6 +1126,16 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} + # updating the grid sizes from the callee_kernels. + for callee_kernel in callee_kernels: + gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id for insn in callee_kernel.instructions), + callables_table, ignore_auto) + + # FIXME: Should assert that nothing is being overwritten + global_sizes.update(gsize) + local_sizes.update(lsize) + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1141,6 +1176,33 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + + assert self.is_called_from_host, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1170,18 +1232,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are :mod:`pymbolic` expressions """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, callables_table, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1189,7 +1253,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1197,17 +1261,22 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, callables_table, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` + *global_size* and *local_size* are :mod:`pymbolic` expressions """ return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), + callables_table, ignore_auto=ignore_auto) # }}} @@ -1398,36 +1467,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ handle linearization variable that doesn't yet exist @property @@ -1439,14 +1478,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", 
DeprecationWarning, stacklevel=2) + from loopy.program import make_program + program = make_program(self) + return program(*args, **kwargs) # }}} @@ -1528,6 +1564,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", + "is_called_from_host", "target", ) @@ -1547,14 +1584,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash def __hash__(self): from loopy.tools import LoopyKeyBuilder diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 84b0a4a74bdc0e4dc32b5823c50a4a9c0a8d9f25..2e43d9b605c313add3d353da38b138f9d57bb9b7 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -835,6 +835,7 @@ class ArrayBase(ImmutableRecord): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, **kwargs) def __eq__(self, other): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 9a90462768684152aec96319199fea0f2fa63435..1f896bb973c2ebc17294ffd5031998c6af962ca6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,16 +24,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript, Call from loopy.tools import intern_frozenset_of_ids, Optional -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace, ValueArg) +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -241,13 +244,11 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): if arrow_idx >= 0: result["inames_to_dup"] = ( result.get("inames_to_dup", []) - + - [(value[:arrow_idx], value[arrow_idx+2:])]) + + [(value[:arrow_idx], value[arrow_idx+2:])]) else: result["inames_to_dup"] = ( result.get("inames_to_dup", []) - + - [(value, None)]) + + [(value, None)]) elif opt_key == "dep" and opt_value is not None: if opt_value.startswith("*"): @@ -500,9 +501,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1134,8 +1137,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() import loopy as lp - - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg if arg_name in self.all_params: return ValueArg(arg_name) @@ -1657,7 +1659,7 @@ def _is_wildcard(s): def 
_resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions_in_single_kernel from loopy.match import MatchExpressionBase new_deps = [] @@ -1666,7 +1668,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions(knl, dep): + for new_dep in find_instructions_in_single_kernel(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True @@ -1819,6 +1821,206 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ slice to sub array ref + +def get_slice_params(slice, dimension_length): + """ + Returns the slice parameters across an axes spanning *domain_length* as a + tuple of ``(start, stop, step)``. + + :arg slice: An instance of :class:`pymbolic.primitives.Slice`. + :arg dimension_length: The axes length swept by *slice*. + """ + from pymbolic.primitives import Slice + assert isinstance(slice, Slice) + start, stop, step = slice.start, slice.stop, slice.step + + # {{{ defaulting parameters + + if step is None: + step = 1 + + if step == 0: + raise LoopyError("Slice cannot have 0 step size.") + + if start is None: + if step > 0: + start = 0 + else: + start = dimension_length-1 + + if stop is None: + if step > 0: + stop = dimension_length + else: + stop = -1 + + # }}} + + return start, stop, step + + +class SliceToInameReplacer(IdentityMapper): + """ + Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. + + .. attribute:: var_name_gen + + Variable name generator, in order to generate unique inames within the + kernel domain. + + .. attribute:: knl + + An instance of :class:`loopy.LoopKernel` + + .. 
attribute:: subarray_ref_bounds + + A :class:`list` (one entry for each :class:`SubArrayRef` to be created) + of :class:`dict` instances to store the slices enountered in the + expressions as a mapping from ``iname`` to a tuple of ``(start, stop, + step)``, which describes the boxy (i.e. affine) constraints imposed on + the ``iname`` by the corresponding slice notation its intended to + replace. + """ + def __init__(self, knl, var_name_gen): + self.var_name_gen = var_name_gen + self.knl = knl + + self.subarray_ref_bounds = [] + + def map_subscript(self, expr): + subscript_iname_bounds = {} + self.subarray_ref_bounds.append(subscript_iname_bounds) + + new_index = [] + swept_inames = [] + for i, index in enumerate(expr.index_tuple): + if isinstance(index, Slice): + unique_var_name = self.var_name_gen(based_on="i") + if expr.aggregate.name in self.knl.arg_dict: + domain_length = self.knl.arg_dict[expr.aggregate.name].shape[i] + elif expr.aggregate.name in self.knl.temporary_variables: + domain_length = self.knl.temporary_variables[ + expr.aggregate.name].shape[i] + else: + raise LoopyError("Slice notation is only supported for " + "variables whose shapes are known at creation time " + "-- maybe add the shape for the sliced argument.") + start, stop, step = get_slice_params( + index, domain_length) + subscript_iname_bounds[unique_var_name] = (start, stop, step) + + new_index.append(start+step*Variable(unique_var_name)) + + swept_inames.append(Variable(unique_var_name)) + else: + new_index.append(index) + + if swept_inames: + return SubArrayRef(tuple(swept_inames), Subscript( + self.rec(expr.aggregate), + self.rec(tuple(new_index)))) + else: + return IdentityMapper.map_subscript(self, expr) + + def map_call(self, expr): + def _convert_array_to_slices(arg): + # FIXME: We do not support something like A[1] should point to the + # second row if 'A' is 3 x 3 array. 
+ if isinstance(arg, Variable): + from loopy.kernel.data import auto + if (arg.name in self.knl.temporary_variables): + if self.knl.temporary_variables[arg.name].shape in [ + auto, None]: + # do not convert arrays with unknown shapes to slices. + array_arg_shape = () + else: + array_arg_shape = ( + self.knl.temporary_variables[arg.name].shape) + elif arg.name in self.knl.arg_dict: + if isinstance(self.knl.arg_dict[arg.name], ValueArg): + array_arg_shape = () + else: + array_arg_shape = self.knl.arg_dict[arg.name].shape + else: + assert arg.name in self.knl.all_inames() + array_arg_shape = () + + if array_arg_shape != (): + return Subscript(arg, tuple(Slice(()) for _ in + array_arg_shape)) + return arg + + return Call(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) for par in + expr.parameters)) + + # FIXME: Missing map_call_with_kwargs + + def get_iname_domain_as_isl_set(self): + """ + Returns the extra domain constraints imposed by the slice inames, + recorded in :attr:`iname_domains`. 
+ """ + subarray_ref_domains = [] + for sar_bounds in self.subarray_ref_bounds: + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(sar_bounds.keys())) + from loopy.symbolic import DependencyMapper + args_as_params_for_domains = set() + for _, (start, stop, step) in sar_bounds.items(): + args_as_params_for_domains |= DependencyMapper()(start) + args_as_params_for_domains |= DependencyMapper()(stop) + args_as_params_for_domains |= DependencyMapper()(step) + + space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_id(dim_type.param, i, isl.Id(arg.name)) + + iname_set = isl.BasicSet.universe(space) + + from loopy.isl_helpers import make_slab + for iname, (start, stop, step) in sar_bounds.items(): + iname_set = iname_set & make_slab(space, iname, start, stop, step) + + subarray_ref_domains.append(iname_set) + + return subarray_ref_domains + + +def realize_slices_array_inputs_as_sub_array_refs(kernel): + """ + Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` + encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. 
+ """ + unique_var_name_generator = kernel.get_var_name_generator() + slice_replacer = SliceToInameReplacer(kernel, unique_var_name_generator) + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + new_expr = slice_replacer(insn.expression) + new_assignees = tuple(slice_replacer(assignee) for assignee in + insn.assignees) + new_insns.append(insn.copy(assignees=new_assignees, + expression=new_expr)) + elif isinstance(insn, (CInstruction, MultiAssignmentBase, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("Unknown type of instruction -- %s" % + type(insn)) + + return kernel.copy( + domains=( + kernel.domains + + slice_replacer.get_iname_domain_as_isl_set()), + instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -1947,6 +2149,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) + is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -1972,55 +2175,56 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - from loopy.version import LANGUAGE_VERSION_SYMBOLS + if not is_callee_kernel: + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for 
LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. " - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. 
" + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2121,6 +2325,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + + # convert slices to iname domains + knl = realize_slices_array_inputs_as_sub_array_refs(knl) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2158,15 +2366,29 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_args_are_output_only + knl = infer_args_are_output_only(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + if is_callee_kernel: + return knl + else: + from loopy.program import make_program + return make_program(knl) + + +def make_function(*args, **kwargs): + lang_version = kwargs.pop('lang_version', None) + if lang_version: + raise LoopyError("lang_version should be set for program, not " + "functions.") - return knl + kwargs['is_callee_kernel'] = True + return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 
9e6e8db666bab61e85981dc697c2d40cea6a18a6..434ee3884b6ad42cf7c5d0a690850c203e0d17ef 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -338,6 +338,7 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) ImmutableRecord.__init__(self, **kwargs) @@ -363,7 +364,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -411,6 +412,9 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 + # Constant Arg cannot be an output + is_output_only = False + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..0cb610074116efcc0047e6301472b6c7782f6954 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,969 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from six.moves import zip + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel +from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import (SubstitutionMapper, DependencyMapper) +from pymbolic.primitives import Variable + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable +.. autoclass:: ManglerCallable + +""" + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + def map_expr(self, subst_mapper): + return self.copy() + + def depends_on(self): + return frozenset() + + update_persistent_hash = update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. 
attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + """ + + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import FixedStrideArrayDimTag + + assert isinstance(shape, tuple) + assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) + + # }}} + + super(ArrayArgDescriptor, self).__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + hash_fields = ( + "shape", + "address_space", + "dim_tags") + + def map_expr(self, subst_mapper): + new_shape = tuple(subst_mapper(axis_len) for axis_len in self.shape) + new_dim_tags = tuple(dim_tag.map_expr(subst_mapper) for dim_tag in + self.dim_tags) + return self.copy(shape=new_shape, dim_tags=new_dim_tags) + + def depends_on(self): + result = DependencyMapper(composite_leaves=False)(self.shape) | ( + DependencyMapper(composite_leaves=False)(tuple(dim_tag.stride for + dim_tag in self.dim_tags))) + return frozenset(var.name for var in result) + + # FIXME ArrayArgDescriptor should never need to be persisted, remove + # this method when that is so. + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.shape) + key_builder.rec(key_hash, self.address_space) + key_builder.rec(key_hash, self.dim_tags) + + +def get_arg_descriptor_for_expression(kernel, expr): + """ + :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` + describing the argument expression *expr* which occurs + in a call in the code of *kernel*. 
+ """ + from pymbolic.primitives import Variable + from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, + SweptInameStrideCollector) + from loopy.kernel.data import TemporaryVariable, ArrayArg + + if isinstance(expr, SubArrayRef): + name = expr.subscript.aggregate.name + arg = kernel.get_var_descriptor(name) + + if not isinstance(arg, (TemporaryVariable, ArrayArg)): + raise LoopyError("unsupported argument type " + "'%s' of '%s' in call statement" + % (type(arg).__name__, expr.name)) + + aspace = arg.address_space + + from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + sub_dim_tags = [] + sub_shape = [] + + # FIXME This blindly assumes that dim_tag has a stride and + # will not work for non-stride dim tags (e.g. vec or sep). + + # (AK) FIXME: This will almost always be nonlinear--when does this + # actually help? Maybe the + # (KK) Reply: This helps in identifying identities like + # "2*(i//2) + i%2" := "i" + # See the kernel in + # test_callables.py::test_shape_translation_through_sub_array_refs + + from loopy.symbolic import simplify_using_aff + linearized_index = simplify_using_aff( + kernel, + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, expr.subscript.index_tuple))) + + strides_as_dict = SweptInameStrideCollector( + tuple(iname.name for iname in expr.swept_inames) + )(linearized_index) + sub_dim_tags = tuple( + DimTag(strides_as_dict[iname]) for iname in expr.swept_inames) + sub_shape = tuple( + pw_aff_to_expr( + kernel.get_iname_bounds(iname.name).upper_bound_pw_aff + - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1 + for iname in expr.swept_inames) + if expr.swept_inames == (): + sub_shape = (1, ) + sub_dim_tags = (DimTag(1),) + + return ArrayArgDescriptor( + address_space=aspace, + dim_tags=sub_dim_tags, + shape=sub_shape) + + elif isinstance(expr, Variable): + arg = kernel.get_var_descriptor(expr.name) + from loopy.kernel.array import ArrayBase + + if isinstance(arg, ValueArg) or (isinstance(arg, 
ArrayBase) + and arg.shape == ()): + return ValueArgDescriptor() + elif isinstance(arg, (ArrayArg, TemporaryVariable)): + raise LoopyError("may not pass entire array " + "'%s' in call statement in kernel '%s'" + % (expr.name, kernel.name)) + else: + raise LoopyError("unsupported argument type " + "'%s' of '%s' in call statement" + % (type(arg).__name__, expr.name)) + + else: + return ValueArgDescriptor() + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. + """ + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if arg.name in kernel.get_written_variables(): + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + if arg.name in kernel.get_read_variables(): + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + if not (arg.name in kernel.get_read_variables() or arg.name in + kernel.get_written_variables()): + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer to + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. note:: + + This class acts as a pseudo-callable and its significance lies in + solving picklability issues. 
+ """ + fields = set(["local_size", "global_size"]) + + def __init__(self, global_size, local_size): + self.global_size = global_size + self.local_size = local_size + + def __call__(self, insn_ids, callables_table, ignore_auto=True): + return self.global_size, self.local_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within expressions in + a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types of the + callable. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. + + .. note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. + + - Negative "arg_id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. 
automethod:: is_ready_for_codegen + """ + + fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super(InKernelCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + update_persistent_hash = update_persistent_hash + + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a copy of *self* which is a new instance of + :class:`InKernelCallable` specialized for the given types, and + *arg_id_to_descr* is a mapping of the same form as the argument above, + however it may have more information present. 
Any argument information + exists both by its positional and its keyword identifier. + """ + + raise NotImplementedError() + + def with_target(self, target): + """ + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer + :meth:`loopy.types.LoopyType.with_target`. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise LoopyError("target cannot be None for with_target") + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype is not None: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in self.arg_id_to_dtype.items()) + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + def with_hw_axes_sizes(self, global_size, local_size): + """ + Returns a copy of *self* with modifications to comply with the grid + sizes ``(local_size, global_size)`` of the program in which it is + supposed to be called. + + :arg local_size: An instance of :class:`islpy.PwAff`. + :arg global_size: An instance of :class:`islpy.PwAff`. + """ + raise NotImplementedError() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ + Yields the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. 
``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstract interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the function and is expected to be supplemented in the + derived subclasses. + """ + + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + hash_fields = fields + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(ScalarCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + raise LoopyError("No type inference information present for " + "the function %s." 
% (self.name)) + + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table, ()) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + + :returns: A tuple of the call to be generated and an instance of + :class:`bool` whether the first assignee is a part of the LHS in + the assignment instruction. + + .. note:: + + The default implementation returns the first assignees and the + references of the rest of the assignees are appended to the + arguments of the call. 
+ + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + """ + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + assignees = insn.assignees[1:] + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in + enumerate(parameters)) + + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in + enumerate(assignees)) + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismatch in function %s. Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + c_parameters.append( + var("&")( + expression_to_code_mapper(a, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr)) + + # assignee is returned whenever the size of assignees is non zero. + first_assignee_is_returned = len(insn.assignees) > 0 + + # TODO: Maybe this interface a bit confusing. Should we allow this + # method to directly return a cgen.Assign or cgen.ExpressionStatement? 
+
+        return var(self.name_in_target)(*c_parameters), first_assignee_is_returned
+
+    def generate_preambles(self, target):
+        return
+        yield
+
+    # }}}
+
+# }}}
+
+
+# {{{ callable kernel
+
+class CallableKernel(InKernelCallable):
+    """
+    Records information about a callee kernel. Also provides interface through
+    member methods to make the callee kernel compatible to be called from a
+    caller kernel. The :meth:`loopy.register_callable_kernel` should be called
+    in order to initiate association between a function in caller kernel and
+    the callee kernel.
+
+    :meth:`CallableKernel.with_types` should be called in order to match
+    the ``dtypes`` of the arguments that are shared between the caller and the
+    callee kernel.
+
+    :meth:`CallableKernel.with_descrs` should be called in order to match
+    :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`,
+    :attr:`ArrayArgDescriptor.address_space` of the arguments shared between the
+    caller and the callee kernel.
+
+    :meth:`CallableKernel.with_hw_axes` should be called to set the grid
+    sizes for the :attr:`subkernel` of the callable. 
+ """ + + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields = fields + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None): + assert isinstance(subkernel, LoopKernel) + + super(CallableKernel, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.subkernel = subkernel.copy( + args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) + if arg.dtype is not None else arg for arg in subkernel.args]) + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, caller_kernel, + callables_table): + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel, callables_table = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + callables_table, + expect_completion=True)) + + new_arg_id_to_dtype = {} + for pos, kw in pos_to_kw.items(): + new_arg_id_to_dtype[kw] = specialized_kernel.arg_dict[kw].dtype + new_arg_id_to_dtype[pos] = specialized_kernel.arg_dict[kw].dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + 
arg_id_to_dtype=new_arg_id_to_dtype), callables_table + + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + # tune the subkernel so that we have the matching shapes and + # dim_tags + + # {{{ map the arg_descrs so that all the variables are from the callees + # perspective + + domain_dependent_vars = frozenset().union( + *(frozenset(dom.get_var_names(1)) for dom in + self.subkernel.domains)) + + # FIXME: This is ill-formed, because par can be an expression, e.g. + # 2*i+2 or 2*(i+1). A key feature of expression is that structural + # equality and semantic equality are not the same, so even if the + # SubstitutionMapper allowed non-variables, it would have to solve the + # (considerable) problem of expression equivalence. + + import numbers + substs = {} + assumptions = {} + for arg, par in zip(self.subkernel.args, expr.parameters): + if isinstance(arg, ValueArg) and arg.name in domain_dependent_vars: + if isinstance(par, Variable): + if par in substs: + assumptions[arg.name] = substs[par].name + else: + substs[par] = Variable(arg.name) + elif isinstance(par, numbers.Number): + assumptions[arg.name] = par + + def subst_func(expr): + if expr in substs: + return substs[expr] + else: + return expr + + subst_mapper = SubstitutionMapper(subst_func) + + arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for + arg_id, descr in arg_id_to_descr.items()) + + # }}} + + dependents = frozenset().union(*(descr.depends_on() for descr in + arg_id_to_descr.values())) + unknown_deps = dependents - self.subkernel.all_variable_names() + # FIXME: Need to make sure that we make the name of the variables + # unique, and then run a subst_mapper + + new_args = self.subkernel.args[:] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) + + if isinstance(descr, ArrayArgDescriptor): + if not 
isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): + raise LoopyError("Array passed to scalar argument " + "'%s' of the function '%s' (in '%s')." % ( + arg_id, self.subkernel.name, + caller_kernel.name)) + if self.subkernel.arg_dict[arg_id].shape and ( + len(self.subkernel.arg_dict[arg_id].shape) != + len(descr.shape)): + raise LoopyError("Dimension mismatch for argument " + " '%s' of the function '%s' (in '%s')." % ( + arg_id, self.subkernel.name, + caller_kernel.name)) + + new_arg = self.subkernel.arg_dict[arg_id].copy( + shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + # replacing the new arg with the arg of the same name + new_args = [new_arg if arg.name == arg_id else arg for arg in + new_args] + elif isinstance(descr, ValueArgDescriptor): + if not isinstance(self.subkernel.arg_dict[arg_id], ValueArg): + raise LoopyError("Scalar passed to array argument " + "'%s' of the callable '%s' (in '%s')" % ( + arg_id, self.subkernel.name, + caller_kernel.name)) + else: + raise LoopyError("Descriptor must be either an instance of " + "ArrayArgDescriptor or ValueArgDescriptor -- got %s" % + type(descr)) + + descriptor_specialized_knl = self.subkernel.copy(args=new_args) + # add the variables on which the strides/shapes depend but not provided + # as arguments + args_added_knl = descriptor_specialized_knl.copy( + args=descriptor_specialized_knl.args + + [ValueArg(dep) for dep in unknown_deps]) + from loopy.preprocess import traverse_to_infer_arg_descr + from loopy.transform.parameter import assume + args_added_knl, callables_table = ( + traverse_to_infer_arg_descr(args_added_knl, + callables_table)) + + if assumptions: + args_added_knl = assume(args_added_knl, ' and '.join([ + '{0}={1}'.format(key, val) for key, val in assumptions.items()])) + + return ( + self.copy( + subkernel=args_added_knl, + arg_id_to_descr=arg_id_to_descr), + callables_table, tuple(Variable(dep) for dep in unknown_deps)) + + def with_packing_for_args(self): + 
from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(gsize, lsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME Check that this is correct. + + return + yield + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + from pymbolic.primitives import CallWithKwargs + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameters.append(kw_parameters[pos_to_kw[i]]) + par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + + # insert the assignees at the required positions + assignee_write_count = -1 + for i, arg in enumerate(self.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + 
assignee_write_count -= 1 + + # no type casting in array calls + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + + return var(self.subkernel.name)(*c_parameters), False + +# }}} + + +# {{{ mangler callable + +class ManglerCallable(ScalarCallable): + """ + A callable whose characteristic is defined by a function mangler. + + .. attribute:: function_mangler + + A function of signature ``(kernel, name , arg_dtypes)`` and returns an + instance of ``loopy.CallMangleInfo``. + """ + fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", + "arg_id_to_descr", "name_in_target") + hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + + def __init__(self, name, function_mangler, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + self.function_mangler = function_mangler + + super(ManglerCallable, self).__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def __getinitargs__(self): + return (self.name, self.function_mangler, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel, callables_table): + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. + for arg_id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + # if does not match, returns an error. 
+ if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " ManglerCallable?") + + sorted_keys = sorted(arg_id_to_dtype.keys()) + arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + mangle_result = self.function_mangler(kernel, self.name, + arg_dtypes) + if mangle_result: + new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) + new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in + enumerate(mangle_result.result_dtypes))) + return ( + self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype), + callables_table) + else: + # The function mangler does not agree with the arg id to dtypes + # provided. Indicating that is illegal. + raise LoopyError("Function %s not coherent with the provided types." % ( + self.name)) + + def mangle_result(self, kernel): + """ + Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for + the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`. + """ + sorted_keys = sorted(self.arg_id_to_dtype.keys()) + arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + return self.function_mangler(kernel, self.name, arg_dtypes) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index bcb6faa3fabaebfd89fa46fff44d137915b1e4bb..3d2f16bb735df860b72f67dd09067f525ba1a19c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six from six.moves import intern from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError @@ -499,7 +500,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -518,13 +519,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -535,6 +543,8 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -960,12 +970,12 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return (_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignee=f(self.assignee, *args), - expression=f(self.expression, *args), + assignee=f(self.assignee, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} @@ -1071,9 +1081,10 @@ class 
CallInstruction(MultiAssignmentBase): forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) - from pymbolic.primitives import Call + from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and ( + expression is not None): raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -1089,9 +1100,10 @@ class CallInstruction(MultiAssignmentBase): expression = parse(expression) from pymbolic.primitives import Variable, Subscript - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef for assignee in assignees: - if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + if not isinstance(assignee, (Variable, Subscript, LinearSubscript, + SubArrayRef)): raise LoopyError("invalid lvalue '%s'" % assignee) self.assignees = assignees @@ -1115,12 +1127,12 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignees=f(self.assignees, *args), - expression=f(self.expression, *args), + assignees=f(self.assignees, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} @@ -1137,6 +1149,22 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result + def arg_id_to_val(self): + """:returns: a :class:`dict` mapping argument identifiers (non-negative numbers + for positional arguments, strings for keyword args, and negative numbers + for assignees) to their respective values + """ + + from 
pymbolic.primitives import CallWithKwargs + arg_id_to_val = dict(enumerate(self.expression.parameters)) + if isinstance(self.expression, CallWithKwargs): + for kw, val in six.iteritems(self.expression.kw_parameters): + arg_id_to_val[kw] = val + for i, arg in enumerate(self.assignees): + arg_id_to_val[-i-1] = arg + + return arg_id_to_val + @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment @@ -1147,34 +1175,118 @@ class CallInstruction(MultiAssignmentBase): # }}} +def subscript_contains_slice(subscript): + """Return *True* if the *subscript* contains an instance of + :class:`pymbolic.primitives.Slice` as of its indices. + """ + from pymbolic.primitives import Subscript, Slice + assert isinstance(subscript, Subscript) + return any(isinstance(index, Slice) for index in subscript.index_tuple) + + +def is_array_call(assignees, expression): + """ + Returns *True* is the instruction is an array call. + + An array call is a function call applied to array type objects. If any of + the arguemnts or assignees to the function is an array, + :meth:`is_array_call` will return *True*. + """ + from pymbolic.primitives import Call, CallWithKwargs, Subscript + from loopy.symbolic import SubArrayRef + + if not isinstance(expression, (Call, CallWithKwargs)): + return False + + for par in expression.parameters+assignees: + if isinstance(par, SubArrayRef): + return True + elif isinstance(par, Subscript): + if subscript_contains_slice(par): + return True + + # did not encounter SubArrayRef/Slice, hence must be a normal call + return False + + +def modify_assignee_assignee_for_array_call(assignee): + """ + Converts the assignee subscript or variable as a SubArrayRef. 
+ """ + from pymbolic.primitives import Subscript, Variable + from loopy.symbolic import SubArrayRef + if isinstance(assignee, SubArrayRef): + return assignee + elif isinstance(assignee, Subscript): + if subscript_contains_slice(assignee): + # Slice subscripted array are treated as SubArrayRef in the kernel + # Hence, making the behavior similar to that of `SubArrayref` + return assignee + else: + return SubArrayRef((), assignee) + elif isinstance(assignee, Variable): + return SubArrayRef((), Subscript(assignee, 0)) + else: + raise LoopyError("ArrayCall only takes Variable, Subscript or " + "SubArrayRef as its inputs") + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): + if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - if len(assignees) == 1: + if len(assignees) != 1 or is_array_call(assignees, expression): + atomicity = kwargs.pop("atomicity", ()) + if atomicity: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, CallWithKwargs, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction, got: '%s'" % expression) + + if not is_array_call(assignees, expression): + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + # In the case of an array call, it is important to have each + # assignee as an instance of SubArrayRef. 
If not given as a + # SubArrayRef + return CallInstruction( + assignees=tuple(modify_assignee_assignee_for_array_call( + assignee) for assignee in assignees), + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + def _is_array(expr): + from loopy.symbolic import SubArrayRef + from pymbolic.primitives import (Subscript, Slice) + if isinstance(expr, SubArrayRef): + return True + if isinstance(expr, Subscript): + return any(isinstance(idx, Slice) for idx in + expr.index_tuple) + return False + + from loopy.symbolic import DependencyMapper + if any(_is_array(dep) for dep in DependencyMapper()((assignees, + expression))): + raise LoopyError("Array calls only supported as instructions" + " with function call as RHS for now.") + return Assignment( assignee=assignees[0], expression=expression, temp_var_type=temp_var_types[0], **kwargs) - atomicity = kwargs.pop("atomicity", ()) - if atomicity: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) - # {{{ c instruction diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index bb6ae44c9bf8daefef5f6564fccbec58ba72a708..b2c054a37367a74e9a60f9b8a4fbc6cf82b14a9c 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -35,26 +35,37 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted - +from loopy.symbolic import CombineMapper +from loopy.kernel import LoopKernel +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.instruction import 
(MultiAssignmentBase, + _DataObliviousInstruction) +from functools import reduce import logging logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(knl, dtype_dict): +def add_dtypes(program, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) + root_kernel = program.root_kernel + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( + root_kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) + root_kernel - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + root_kernel_with_added_dtypes = ( + root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) + + return program.with_root_kernel(root_kernel_with_added_dtypes) def _add_dtypes_overdetermined(knl, dtype_dict): @@ -106,7 +117,8 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): + assert isinstance(prog, Program) processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -115,10 +127,10 @@ def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): if subkey: processed_dtype_dict[subkey] = v - knl = add_dtypes(knl, processed_dtype_dict) + prog = add_dtypes(prog, processed_dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=expect_completion) + return infer_unknown_types(prog, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): @@ -454,7 +466,9 @@ class DomainChanger: # {{{ graphviz / dot export -def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): 
+@iterate_over_kernels_if_given_program +def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, + use_insn_id=False): """Return a string in the `dot `_ language depicting dependencies among kernel instructions. """ @@ -466,7 +480,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn @@ -747,7 +761,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -761,7 +775,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -789,6 +803,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), + callables_table, axis=recursion_axis) if axis is None: @@ -828,7 +843,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
@@ -839,6 +855,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -860,7 +877,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -928,7 +945,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1852,9 +1870,59 @@ def find_aliasing_equivalence_classes(kernel): # }}} +# {{{ callee kernel tools + +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): + """ + Returns an instance of :class:`frozenset` of all the callee kernels + called in instructions in the *kernel* whose IDs are given in *insn_ids*. + + :arg kernel: An instance of :class:`LoopKernel`. + :arg insn_ids: An instance of :class:`frozenset`. + + If *insn_ids* is *None* returns all the callee kernels called by *kernel*. + """ + #FIXME: explain what "direct" means + + if insn_ids is None: + insn_ids = frozenset(insn.id for insn in kernel.instructions) + + from loopy.kernel.function_interface import CallableKernel + + def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): + """Returns callee kernel if the instruction has a call to a + :class:`loopy.kernel.function_interface.CallableKernel`. Otherwise + returns *None*. 
+ """ + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import (CallInstruction, + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) + from pymbolic.primitives import Call + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in callables_table): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + return in_knl_callable.subkernel + elif isinstance(insn, (MultiAssignmentBase, + CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." % + type(insn)) + + return None + + return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(insn_id) + for insn_id in insn_ids]) - frozenset([None]) + +# }}} + + # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_output_only(kernel): """ Returns a copy of *kernel* with the attribute ``is_output_only`` set. 
@@ -1866,6 +1934,7 @@ def infer_arg_is_output_only(kernel): """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, ArrayArg): if arg.is_output_only is not None: @@ -1885,4 +1954,61 @@ def infer_arg_is_output_only(kernel): # }}} + +# {{{ identify_root_kernel + +class CallCollector(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def identify_root_kernel(kernels): + assert isinstance(kernels, list) + assert all(isinstance(knl, LoopKernel) for knl in kernels) + call_collector = CallCollector() + + def _calls_in_a_kernel(knl): + calls = set() + for insn in knl.instructions: + if isinstance(insn, MultiAssignmentBase): + calls = calls | call_collector(insn.expression) + elif isinstance(insn, _DataObliviousInstruction): + pass + else: + raise NotImplementedError() + + return calls + + all_calls = frozenset().union(*[_calls_in_a_kernel(knl) for knl in + kernels]) + + kernel_names = frozenset([knl.name for knl in kernels]) + + assert len(kernel_names - all_calls) == 1 + + root_knl_name, = (kernel_names - all_calls) + return root_knl_name + +# }}} + # vim: foldmethod=marker diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9fe5c4c040608dc181b96daa812405a65..378b7de5897912e2e04314b066f40e5ea6b0c785 100644 --- a/loopy/library/function.py +++ 
b/loopy/library/function.py @@ -22,38 +22,107 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None +from loopy.kernel.function_interface import ScalarCallable +from loopy.diagnostic import LoopyError + + +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, callables_table): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), callables_table) + + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + callables_table, ()) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, callables_table): + new_arg_id_to_dtype = dict((i, dtype) for i, 
dtype in + arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype[-1] = kernel.index_dtype + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + callables_table) + + def emit_call(self, expression_to_code_mapper, expression, target): + from pymbolic.primitives import Subscript + + if len(expression.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % self.name) + arg, = expression.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % self.name) + + ary = expression_to_code_mapper.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(expression_to_code_mapper.kernel.target, + ary, arg.index, lambda expr: evaluate(expr, + expression_to_code_mapper.codegen_state.var_subst_map), + expression_to_code_mapper.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % self.name) + + if self.name == "indexof": + return access_info.subscripts[0] + elif self.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + return self.emit_call( + expression_to_code_mapper, + insn.expression, + target), True + + +def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` for the *idenitifer* + which is not present in *target*, but whose interface is given by + :mod:`loo.py`. 
Callables that fall in this category are -- + + - reductions leading to function calls like ``argmin``, ``argmax``. + - callables that have a predefined meaning in :mod:`loo.py` like + ``make_tuple``, ``index_of``, ``indexof_vec``. + """ + if identifier == "make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + from loopy.library.reduction import ( + reduction_func_id_to_in_knl_callable_mapper) + return reduction_func_id_to_in_knl_callable_mapper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114ddeb9d48eb33a765755302917ca27f63..e59a892bb4c7b3bd7222bf61b29e0ade92195240 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,60 +164,77 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. 
+ """ + + def with_types(self, arg_id_to_dtype, kernel, callables_table): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return (self.copy(), + callables_table) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return ( + self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen"), + callables_table) + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), callables_table + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), callables_table + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from 
loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def random123_function_id_to_in_knl_callable_mapper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 53d05a28e7245e381be769af12d6066ffb486541..5b78c08f4d3588123a6eaf1d6dccda239ef6fed7 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,11 +24,14 @@ THE SOFTWARE. 
from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.tools import update_persistent_hash class ReductionOperation(object): @@ -81,6 +84,9 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) + def get_scalar_callables(self): + return frozenset() + class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -180,7 +186,10 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ResolvedFunction("max")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["max"]) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +197,10 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ResolvedFunction("min")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["min"]) # {{{ base class for symbolic reduction ops @@ -212,6 +224,10 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = update_persistent_hash # }}} @@ -245,7 +261,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -262,7 +278,10 @@ 
class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset(["make_tuple", SegmentedOp(self)]) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -270,34 +289,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = update_persistent_hash # }}} @@ -330,7 +339,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -347,7 +356,10 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset([self.which, "make_tuple", ArgExtOp(self)]) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -355,43 +367,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return 
op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + update_persistent_hash = update_persistent_hash # }}} @@ -446,70 +438,97 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CFamilyTarget - if not isinstance(kernel.target, CFamilyTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CFamilyTarget - if not isinstance(kernel.target, CFamilyTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, callables_table): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + 
new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) + "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), callables_table + + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table, ()) + + def generate_preambles(self, target): + if isinstance(self.name, ArgExtOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, SegmentedOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_func_id_to_in_knl_callable_mapper(target, identifier): + if isinstance(identifier, ReductionOpFunction): + from loopy.target.c import CFamilyTarget + if not isinstance(target, CFamilyTarget): + raise LoopyError("%s: only C-like targets supported for now" % + identifier) + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 4592463822a2321745aaf48a316d16c98d4efca3..24cbe730f7679ba9b9931f7493d3c793ce3718c9 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -25,13 +25,15 @@ THE SOFTWARE. import islpy as isl import six +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. 
- :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.find_loop_nest_around_map` """ result = {} @@ -55,7 +57,9 @@ def potential_loop_nest_map(kernel): return result -def fuse_loop_domains(kernel): +@iterate_over_kernels_if_given_program +def merge_loop_domains(kernel): + # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames while True: @@ -63,11 +67,13 @@ def fuse_loop_domains(kernel): parents_per_domain = kernel.parents_per_domain() all_parents_per_domain = kernel.all_parents_per_domain() + iname_to_insns = kernel.iname_to_insns() + new_domains = None for inner_iname, outer_inames in six.iteritems(lnm): for outer_iname in outer_inames: - # {{{ check if it's safe to fuse + # {{{ check if it's safe to merge inner_domain_idx = kernel.get_home_domain_index(inner_iname) outer_domain_idx = kernel.get_home_domain_index(outer_iname) @@ -75,12 +81,28 @@ def fuse_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if (not iname_to_insns[inner_iname] + or not iname_to_insns[outer_iname]): + # Inames without instructions occur when used in + # a SubArrayRef. We don't want monster SubArrayRef domains, + # so refuse to merge those. + continue + + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: + # The two inames are imperfectly nested. Domain fusion + # might be invalid when the inner loop is empty, leading to + # the outer loop also being empty. + + # FIXME: Not fully correct, does not consider reductions + # https://gitlab.tiker.net/inducer/loopy/issues/172 + continue + if ( outer_domain_idx in all_parents_per_domain[inner_domain_idx] and not outer_domain_idx == parents_per_domain[inner_domain_idx]): # Outer domain is not a direct parent of the inner - # domain. Unable to fuse. + # domain. Unable to merge. 
continue outer_dom = kernel.domains[outer_domain_idx] @@ -90,7 +112,7 @@ def fuse_loop_domains(kernel): if is_domain_dependent_on_inames(kernel, inner_domain_idx, outer_inames): # Bounds of inner domain depend on outer domain. - # Unable to fuse. + # Unable to merge. continue # }}} diff --git a/loopy/match.py b/loopy/match.py index 3c047e463939cd67a4878d202a754c0cab48058d..9766fac2b57f5cb55eebb09f4ab32880ef3c2038 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -49,6 +49,7 @@ Match expressions .. autoclass:: Tagged .. autoclass:: Writes .. autoclass:: Reads +.. autoclass:: InKernel .. autoclass:: Iname """ @@ -73,6 +74,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") +_in_kernel = intern("_in_kernel") _iname = intern("_iname") _whitespace = intern("_whitespace") @@ -92,13 +94,14 @@ _LEX_TABLE = [ (_tag, RE(r"tag:([\w?*]+)")), (_writes, RE(r"writes:([\w?*]+)")), (_reads, RE(r"reads:([\w?*]+)")), + (_in_kernel, RE(r"in_kernel:([\w?*]+)")), (_iname, RE(r"iname:([\w?*]+)")), (_whitespace, RE("[ \t]+")), ] -_TERMINALS = ([_id, _tag, _writes, _reads, _iname]) +_TERMINALS = ([_id, _tag, _writes, _reads, _in_kernel, _iname]) # {{{ operator precedence @@ -262,6 +265,11 @@ class Reads(GlobMatchExpressionBase): for name in matchable.read_dependency_names()) +class InKernel(GlobMatchExpressionBase): + def __call__(self, kernel, matchable): + return self.re.match(kernel.name) + + class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) @@ -299,6 +307,10 @@ def parse_match(expr): result = Reads(pstate.next_match_obj().group(1)) pstate.advance() return result + elif next_tag is _in_kernel: + result = InKernel(pstate.next_match_obj().group(1)) + pstate.advance() + return result elif next_tag is _iname: result = Iname(pstate.next_match_obj().group(1)) pstate.advance() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 
23c4b7fbd9e55006dd17ed9b127e598a14ee17a2..d3b8ef8a3fcd2b80f73d167f12f8e0506ce9c4e4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -22,12 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -37,22 +38,32 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import RuleAwareIdentityMapper -import logging -logger = logging.getLogger(__name__) +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger # {{{ prepare for caching +@iterate_over_kernels_if_given_program def prepare_for_caching(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -744,7 +755,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): # }}} - from loopy.kernel.instruction import CallInstruction + from loopy.kernel.instruction import CallInstruction, is_array_call for insn in kernel.instructions: if not isinstance(insn, CallInstruction): continue @@ -752,6 +763,9 @@ 
def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if len(insn.assignees) <= 1: continue + if is_array_call(insn.assignees, insn.expression): + continue + assignees = insn.assignees assignee_var_names = insn.assignee_var_names() @@ -882,9 +896,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -1004,7 +1018,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1122,7 +1136,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1362,7 +1376,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1451,17 +1465,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ 
local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1660,15 +1674,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( + arg_dtypes, reduction_dtypes, callables_table = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1777,15 +1791,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1804,12 +1820,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes) # }}} @@ -1842,9 +1859,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. 
from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + callables_table=callables_table, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = ( + cb_mapper(insn.expression, + callables_table=callables_table),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1922,7 +1943,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - kernel = lp.tag_inames(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_tags) # TODO: remove unused inames... @@ -1932,6 +1954,31 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return kernel + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = realize_reduction_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
Infers the :attr:`loopy.kernel.function_interface.InKernelCallable.arg_id_to_descr`
of the resolved callables in the expressions of a kernel.
# FIXME: Order for vars when kwargs are present?
+ """ + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, callables_table) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table + + +def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ + root_kernel_callable = program.callables_table[program.name] + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel = program.root_kernel + + new_root_kernel, callables_table = traverse_to_infer_arg_descr( + root_kernel, callables_table) + new_root_kernel_callable = root_kernel_callable.copy( + subkernel=new_root_kernel) + callables_table, _ = callables_table.with_callable(program.name, + new_root_kernel_callable) + + callables_table = callables_table.with_exit_edit_callables_mode( + old_callables_count) + + return program.copy(callables_table=callables_table) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) - +def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2135,7 +2319,7 @@ def preprocess_kernel(kernel, device=None): # }}} - logger.info("%s: preprocess start" % kernel.name) + prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from 
loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -2158,8 +2342,6 @@ def preprocess_kernel(kernel, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. - kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2174,8 +2356,8 @@ def preprocess_kernel(kernel, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - - kernel = realize_reduction(kernel, unknown_types_ok=False) + kernel = realize_reduction_for_single_kernel(kernel, + callables_table, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2195,11 +2377,11 @@ def preprocess_kernel(kernel, device=None): kernel = kernel.target.preprocess(kernel) - logger.info("%s: preprocess done" % kernel.name) - kernel = kernel.copy( state=KernelState.PREPROCESSED) + prepro_logger.done() + # {{{ prepare for caching # PicklableDtype instances for example need to know the target they're working @@ -2219,4 +2401,94 @@ def preprocess_kernel(kernel, device=None): return kernel + +# {{{ hw axes inference + +def infer_hw_axes_sizes(program): + """ + Returns copy of *program* with the hardware axes sizes inferred. + + .. note:: + + - Firstly, computes the collective hardware axes sizes from all the + callable kernels. + - Then, overrides the grid sizes of all the callable kernels to the + collective value. 
+ """ + + global_size, local_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_inferred = {} + + for func_id, in_knl_callable in ( + program.callables_table.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable.with_hw_axes_sizes(global_size, local_size)) + + new_callables_table = ( + program.callables_table.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) + + return program.copy(callables_table=new_callables_table) + +# }}} + + +def preprocess_program(program, device=None): + + if device is not None: + # FIXME: Time to remove this? (Git blame shows 5 years ago) + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + # {{{ preprocess callable kernels + + # Callable editing restrictions: + # + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] + # + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.callables_table, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + program = program.copy(callables_table=new_callables_table) + + # }}} + + # infer arg descrs of the callables + program = infer_arg_descr(program) + + program = infer_hw_axes_sizes(program) + + return program + + +# FIXME: Do we add a deprecation warning? +preprocess_kernel = preprocess_program + + # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb691531768bcf202327e485daa03f842356782 --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,1003 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
A function is known in the *kernel*, if
:func:`loopy.kernel.LoopKernel.find_scoped_function_identifier`
returns an instance of
:class:`loopy.kernel.function_interface.InKernelCallable`.
# FIXME: do we really need the given target for the function?
Returns a list of functions that are provided through *target* by default.
- To create an instance of :class:`loopy.Program`, it is recommended to
  go through :meth:`loopy.make_kernel`.
- This data structure and its attributes should be considered
  immutable, any modifications should be done through :meth:`copy`.
automethod:: __init__ + .. automethod:: with_root_kernel + .. method:: __getitem__(name) + + Look up the resolved callable with identifier *name*. + """ + def __init__(self, + name, + callables_table, + target, + func_id_to_in_knl_callable_mappers): + assert isinstance(callables_table, CallablesTable) + + assert name in callables_table + + super(Program, self).__init__( + name=name, + callables_table=callables_table, + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "name", + "callables_table", + "target",) + + update_persistent_hash = update_persistent_hash + + def copy(self, **kwargs): + if 'target' in kwargs: + # target attribute of all the callable kernels should be updated. + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.callables_table.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + callables_table = new_self.callables_table.copy( + resolved_functions=new_resolved_functions) + + return super(Program, new_self).copy( + callables_table=callables_table) + else: + return super(Program, self).copy(**kwargs) + + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + return self.root_kernel.get_grid_size_upper_bounds( + self.callables_table, + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( + self.callables_table, + ignore_auto=ignore_auto) + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + + @property + def root_kernel(self): + """ + Returns an instance of :class:`loopy.LoopKernel` denoting the topmost + level kernel. + """ + return self.callables_table[self.name].subkernel + + @property + def arg_dict(self): + """ + Returns ``arg_dict`` of the ``root_kernel``. + """ + return self.root_kernel.arg_dict + + @property + def args(self): + """Returns ``args`` of the ``root_kernel``.""" + return self.root_kernel.args[:] + + def with_root_kernel(self, root_kernel): + """:returns: a copy of *self* with the topmost level kernel as + *root_kernel*. + """ + assert self.name == root_kernel.name + return self.with_kernel(root_kernel) + + def with_kernel(self, kernel): + # FIXME: Currently only replaces kernel. Should also work for adding. 
+ # FIXME: Document + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) + new_resolved_functions = self.callables_table.resolved_functions.copy() + new_resolved_functions[kernel.name] = new_in_knl_callable + return self.copy( + callables_table=self.callables_table.copy( + resolved_functions=new_resolved_functions)) + + def __iter__(self): + return six.iterkeys(self.callables_table.resolved_functions) + + def __getitem__(self, name): + result = self.callables_table[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result + + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + # FIXME: do a topological sort by the call graph + + def strify_callable(clbl): + if isinstance(clbl, CallableKernel): + return str(clbl.subkernel) + else: + return str(clbl) + + return "\n".join( + strify_callable(clbl) + for name, clbl in self.callables_table.items()) + +# }}} + + +def next_indexed_function_identifier(function_id): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``'sin_0'`` will return ``'sin_1'``. + + :arg function_id: Either an instance of :class:`str`. 
func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
+ """ + def __init__(self, callables_table): + self.callables_table = callables_table + + def combine(self, values): + return sum(values, Counter()) + + def map_call(self, expr): + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + if isinstance(expr.function, (ResolvedFunction)): + in_knl_callable = self.callables_table[expr.function.name] + if isinstance(in_knl_callable, ScalarCallable): + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + elif isinstance(in_knl_callable, CallableKernel): + + # callable kernels have more callables in them. + callables_count_in_subkernel = ( + count_callables_in_kernel( + in_knl_callable.subkernel, + self.callables_table)) + + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + ( + callables_count_in_subkernel) + else: + raise NotImplementedError("Unknown callable type %s." % ( + type)) + else: + return ( + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call + + def map_reduction(self, expr): + return Counter(expr.operation.get_scalar_callables()) + ( + super(CallablesCountingMapper, self).map_reduction(expr)) + + def map_constant(self, expr): + return Counter() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +@memoize_method +def count_callables_in_kernel(kernel, callables_table): + """ + Returns an instance of :class:`collections.Counter` representing the number + of callables in the *kernel* that are registered in + *callables_table*. 
+ """ + assert isinstance(kernel, LoopKernel) + callables_count = Counter() + callables_counting_mapper = CallablesCountingMapper( + callables_table) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_count += ( + callables_counting_mapper(subst_expander( + insn.expression))) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction type %s." % ( + type(insn))) + + return callables_count + +# }}} + + +# {{{ program callables info + +class CallablesTable(ImmutableRecord): + # FIXME: is CallablesTable a better name?(similar to symbol table in + # compilers.) + """ + Records the information of all the callables called in a :class:`loopy.Program`. + + .. attribute:: resolved_functions + + An instance of :class:`dict` that contains a mapping from function + identifier to instances of + :class:`loopy.kernel.function_interface.InKernelCallable` + + .. attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to and instance of :class:`list`that would contain all the + names taken by a function before the current name.(For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) + + .. attribute:: is_being_edited + + An instance of :class:`bool` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + + .. automethod:: __init__ + .. automethod:: callables_count + .. automethod:: with_added_callable + .. automethod:: with_edit_callables_mode + .. automethod:: with_callable + .. 
An instance of :class:`dict` that contains a mapping from function
identifier to an instance of :class:`list` that would contain all the
names taken by a function before the current name. (For example: one
possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``)
+ + - The difference between + :meth:`loopy.CallablesTable.with_added_callable` + and :meth:`CallablesTable.with_callable` being that + the former has no support for renaming the callable back i.e. + ``with_callable`` supports renaming from ``sin_0`` to ``sin``, + if possible, through the member method + ``loopy.CallablesTable.with_exit_edit_callables_mode`` + + This subtle difference makes -- + + - :meth:`loopy.CallablesTable.with_added_callable` suitable + for usage while resolving the functions first time, where no + renaming is needed. + + - :meth:`loopy.CallablesTable.with_callable` suitable for + implementing edits in callables during inference-walks. + """ + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | frozenset([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) + + def with_edit_callables_mode(self): + """ + Returns a copy of *self* for a walk traversal through all the callables. + """ + return self.copy( + is_being_edited=True) + + def with_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Also refer -- :meth:`loopy.CallablesTable.with_added_callable` + + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. 
+ + :arg in_kernel_callable: An instance of + :class:`loopy.InKernelCallable`. + + .. note:: + + - Use :meth:`with_added_callable` if a callable is being resolved for the + first time. + """ + + # {{{ non-edit mode + + if not self.is_being_edited: + if function.name in self.resolved_functions and ( + self.resolved_functions[function.name] == in_kernel_callable): + # if not being edited, check that the given function is + # equal to the old version of the callable. + return self, function + else: + print('Old: ', self.resolved_functions[function.name]) + print('New: ', in_kernel_callable) + raise LoopyError("Use 'with_enter_edit_callables_mode' first.") + + # }}} + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | frozenset([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + unique_function_identifier = function.name + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = ( + history[function.name] | frozenset([unique_function_identifier])) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) + + def with_exit_edit_callables_mode(self, old_callables_count): + """ + Returns a copy of *self* with renaming of the callables done whenever + possible. + + *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, + then all the renaming is done such that one of flavors of the callable + is renamed back to ``sin``. 
+ """ + + assert self.is_being_edited + + new_callables_count = self.callables_count + + # {{{ calculate the renames needed + + renames_needed = {} + for old_func_id in old_callables_count-new_callables_count: + # this implies that all the function instances having the name + # "func_id" have been renamed to something else. + for new_func_id in ( + six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + if old_func_id in self.history[new_func_id]: + renames_needed[new_func_id] = old_func_id + break + # }}} + + new_resolved_functions = {} + new_history = {} + + for func_id in new_callables_count: + in_knl_callable = self.resolved_functions[func_id] + if isinstance(in_knl_callable, CallableKernel): + # if callable kernel, perform renames inside its expressions. + old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, renames_needed) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) + + if func_id in renames_needed: + new_func_id = renames_needed[func_id] + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = (in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=new_func_id))) + new_resolved_functions[new_func_id] = ( + in_knl_callable) + new_history[new_func_id] = self.history[func_id] + else: + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=func_id)) + new_resolved_functions[func_id] = in_knl_callable + new_history[func_id] = self.history[func_id] + + return self.copy( + is_being_edited=False, + resolved_functions=new_resolved_functions, + history=new_history) + + # }}} + + # {{{ behave like a dict(syntactic sugar) + + def __getitem__(self, item): + return self.resolved_functions[item] + + def __contains__(self, item): + return item in self.resolved_functions + + def items(self): + return six.iteritems(self.resolved_functions) + + def values(self): + return six.itervalues(self.resolved_functions) + + def keys(self): + return six.iterkeys(self.resolved_functions) + + # }}} + +# }}} + + +# {{{ helper functions + +def make_program(kernel): + """ + Returns an instance of :class:`loopy.Program` with the *kernel* as the root + kernel. + """ + + # get the program callables info + callables_table = initialize_callables_table_from_kernel(kernel) + + # get the program from program callables info + program = Program( + name=kernel.name, + callables_table=callables_table, + func_id_to_in_knl_callable_mappers=( + _default_func_id_to_kernel_callable_mappers(kernel.target)), + target=kernel.target) + + return program + + +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. 
Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. + """ + def _collective_transform(program_or_kernel, *args, **kwargs): + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel, *args, **kwargs) + + return wraps(transform_for_single_kernel)(_collective_transform) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 0983c5e0d513d51a04a3f6cf3033904435ef1412..5348443c66127baea5068c0bc5bd491abd4b4678 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1644,16 +1644,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " + "%s: Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " "instruction option to state that no " 
"synchronization is needed)" % ( + kernel_name, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1724,7 +1725,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 for dep in chain.from_iterable( dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is # executed unconditionally and so kills all sources before # the loop. @@ -1756,7 +1758,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) @@ -1822,7 +1825,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. 
warning:: @@ -1835,18 +1838,19 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): + for sched in generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -1958,7 +1962,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2015,7 +2020,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2025,19 +2030,19 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, callables_table): warn_with_kernel( kernel, "get_one_scheduled_kernel_deprecated", "get_one_scheduled_kernel is deprecated. 
" "Use get_one_linearized_kernel instead.", DeprecationWarning) - return get_one_linearized_kernel(kernel) + return get_one_linearized_kernel(kernel, callables_table) -def get_one_linearized_kernel(kernel): +def get_one_linearized_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2055,7 +2060,8 @@ def get_one_linearized_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..86f39e55bd0e5de2773ee3b5b42a08885191a9c6 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,6 +1,10 @@ from __future__ import division, absolute_import, print_function -__copyright__ = "Copyright (C) 2015 James Stevens" +__copyright__ = """ +Copyright (C) 2015 James Stevens +Copyright (C) 2018 Kaushik Kulkarni +Copyright (C) 2019 Andreas Kloeckner +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,18 +26,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from functools import partial import six import loopy as lp from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper -from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method +from loopy.kernel.function_interface import CallableKernel +from loopy.kernel import LoopKernel +from loopy.program import make_program __doc__ = """ @@ -41,6 +48,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: ToCountPolynomialMap .. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -60,13 +68,50 @@ __doc__ = """ """ +# FIXME: +# - The SUBGROUP granularity is completely broken if the root kernel +# contains the grid and the operations get counted in the callee. +# To test, most of those are set to WORKITEM instead below (marked +# with FIXMEs). This leads to value mismatches and key errors in +# the tests. +# - Currently, nothing prevents summation across different +# granularities, which is guaranteed to yield bogus results. 
+# - AccessFootprintGatherer needs to be redone to match get_op_map and +# get_mem_access_map style +# - Test for the subkernel functionality need to be written + + +def get_kernel_parameter_space(kernel): + return isl.Space.create_from_names(kernel.isl_context, + set=[], params=sorted(list(kernel.outer_params()))).params() + + +def get_kernel_zero_pwqpolynomial(kernel): + space = get_kernel_parameter_space(kernel) + space = space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + + # {{{ GuardedPwQPolynomial +def _get_param_tuple(obj): + return tuple( + obj.get_dim_name(dim_type.param, i) + for i in range(obj.dim(dim_type.param))) + + class GuardedPwQPolynomial(object): def __init__(self, pwqpolynomial, valid_domain): self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain + assert (_get_param_tuple(pwqpolynomial.space) + == _get_param_tuple(valid_domain.space)) + + @property + def space(self): + return self.valid_domain.space + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( @@ -115,7 +160,7 @@ class GuardedPwQPolynomial(object): return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -123,7 +168,20 @@ class GuardedPwQPolynomial(object): # {{{ ToCountMap class ToCountMap(object): - """Maps any type of key to an arithmetic type. + """A map from work descriptors like :class:`Op` and :class:`MemAccess` + to any arithmetic type. + + .. automethod:: __getitem__ + .. automethod:: __str__ + .. automethod:: __repr__ + .. automethod:: __len__ + .. automethod:: get + .. automethod:: items + .. automethod:: keys + .. automethod:: values + + .. automethod:: copy + .. automethod:: with_set_attributes .. automethod:: filter_by .. 
automethod:: filter_by_func @@ -134,17 +192,20 @@ class ToCountMap(object): """ - def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): - if init_dict is None: - init_dict = {} - self.count_map = init_dict - self.val_type = val_type + def __init__(self, count_map=None): + if count_map is None: + count_map = {} + + self.count_map = count_map + + def _zero(self): + return 0 def __add__(self, other): result = self.count_map.copy() for k, v in six.iteritems(other.count_map): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result, self.val_type) + return self.copy(count_map=result) def __radd__(self, other): if other != 0: @@ -152,36 +213,28 @@ class ToCountMap(object): "to {0} {1}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) + return self def __mul__(self, other): - if isinstance(other, GuardedPwQPolynomial): - return ToCountMap(dict( - (index, self.count_map[index]*other) - for index in self.keys())) - else: - raise ValueError("ToCountMap: Attempted to multiply " - "ToCountMap by {0} {1}." - .format(type(other), other)) + return self.copy(dict( + (index, value*other) + for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ def __getitem__(self, index): - try: - return self.count_map[index] - except KeyError: - #TODO what is the best way to handle this? 
- if self.val_type is GuardedPwQPolynomial: - return GuardedPwQPolynomial.zero() - else: - return 0 - - def __setitem__(self, index, value): - self.count_map[index] = value + return self.count_map[index] def __repr__(self): return repr(self.count_map) + def __str__(self): + return "\n".join( + "%s: %s" % (k, v) + for k, v in sorted(six.iteritems(self.count_map), + key=lambda k: str(k))) + def __len__(self): return len(self.count_map) @@ -194,17 +247,19 @@ class ToCountMap(object): def keys(self): return self.count_map.keys() - def pop(self, item): - return self.count_map.pop(item) + def values(self): + return self.count_map.values() + + def copy(self, count_map=None): + if count_map is None: + count_map = self.count_map - def copy(self): - return ToCountMap(dict(self.count_map), self.val_type) + return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return ToCountMap(dict( + return self.copy(count_map=dict( (key.copy(**kwargs), val) - for key, val in six.iteritems(self.count_map)), - self.val_type) + for key, val in six.iteritems(self.count_map))) def filter_by(self, **kwargs): """Remove items without specified key fields. 
@@ -231,28 +286,25 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) - - from loopy.types import to_loopy_type - if 'dtype' in kwargs.keys(): - kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] - - # for each item in self.count_map - for self_key, self_val in self.items(): - try: - # check to see if key attribute values match all filters - for arg_field, allowable_vals in kwargs.items(): - attr_val = getattr(self_key, arg_field) - # see if the value is in the filter list - if attr_val not in allowable_vals: - break - else: # loop terminated without break or error - result_map[self_key] = self_val - except(AttributeError): - # the field passed is not a field of this key - continue - - return result_map + new_count_map = {} + + class _Sentinel: + pass + + new_kwargs = {} + for arg_field, allowable_vals in six.iteritems(kwargs): + if arg_field == "dtype": + from loopy.types import to_loopy_type + allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] + + new_kwargs[arg_field] = allowable_vals + + for key, val in six.iteritems(self.count_map): + if all(getattr(key, arg_field, _Sentinel) in allowable_vals + for arg_field, allowable_vals in six.iteritems(new_kwargs)): + new_count_map[key] = val + + return self.copy(count_map=new_count_map) def filter_by_func(self, func): """Keep items that pass a test. 
@@ -279,14 +331,13 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - # for each item in self.count_map, call func on the key - for self_key, self_val in self.items(): + for self_key, self_val in six.iteritems(self.count_map): if func(self_key): - result_map[self_key] = self_val + new_count_map[self_key] = self_val - return result_map + return self.copy(count_map=new_count_map) def group_by(self, *args): """Group map items together, distinguishing by only the key fields @@ -334,7 +385,7 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} # make sure all item keys have same type if self.count_map: @@ -343,22 +394,17 @@ class ToCountMap(object): raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") else: - return result_map + return self - # for each item in self.count_map - for self_key, self_val in self.items(): - new_key = key_type() + for self_key, self_val in six.iteritems(self.count_map): + new_key = key_type( + **dict( + (field, getattr(self_key, field)) + for field in args)) - # set all specified fields - for field in args: - setattr(new_key, field, getattr(self_key, field)) - - if new_key in result_map.keys(): - result_map[new_key] += self_val - else: - result_map[new_key] = self_val + new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val - return result_map + return self.copy(count_map=new_count_map) def to_bytes(self): """Convert counts to bytes using data type in map key. @@ -391,48 +437,74 @@ class ToCountMap(object): """ - result = self.copy() + new_count_map = {} - for key, val in self.items(): - bytes_processed = int(key.dtype.itemsize) * val - result[key] = bytes_processed + for key, val in six.iteritems(self.count_map): + new_count_map[key] = int(key.dtype.itemsize) * val - #TODO again, is this okay? 
- result.val_type = int - - return result + return self.copy(new_count_map) def sum(self): - """Add all counts in ToCountMap. - - :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the - sum of counts. + """:return: A sum of the values of the dictionary.""" - """ - - if self.val_type is GuardedPwQPolynomial: - total = GuardedPwQPolynomial.zero() - else: - total = 0 + total = self._zero() - for k, v in self.items(): + for k, v in six.iteritems(self.count_map): total += v + return total - #TODO test and document - def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result +# }}} - def eval_and_sum(self, params): - """Add all counts in :class:`ToCountMap` and evaluate with provided - parameter dict. - :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. +# {{{ ToCountPolynomialMap + +class ToCountPolynomialMap(ToCountMap): + """Maps any type of key to a :class:`islpy.PwQPolynomial` or a + :class:`GuardedPwQPolynomial`. 
+ """ + + def __init__(self, space, count_map=None): + if not isinstance(space, isl.Space): + raise TypeError( + "first argument to ToCountPolynomialMap must be " + "of type islpy.Space") + + assert space.is_params() + self.space = space + + space_param_tuple = _get_param_tuple(space) + + for key, val in six.iteritems(count_map): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) == 1 + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) == 1 + else: + raise TypeError("unexpected value type") + + assert _get_param_tuple(val.space) == space_param_tuple + + super(ToCountPolynomialMap, self).__init__(count_map) + + def _zero(self): + space = self.space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + + def copy(self, count_map=None, space=None): + if count_map is None: + count_map = self.count_map + + if space is None: + space = self.space + + return type(self)(space, count_map) + + def eval_and_sum(self, params=None): + """Add all counts and evaluate with provided parameter dict *params* + + :return: An :class:`int` containing the sum of all counts + evaluated with the parameters provided. 
Example usage:: @@ -447,19 +519,70 @@ class ToCountMap(object): # (now use these counts to, e.g., predict performance) """ + if params is None: + params = {} + return self.sum().eval_with_dict(params) # }}} +# {{{ subst_into_to_count_map + +def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial, get_param_subst_domain + + poly = subst_into_pwqpolynomial( + new_space, guarded_poly.pwqpolynomial, subst_dict) + + valid_domain = guarded_poly.valid_domain + i_begin_subst_space = valid_domain.dim(dim_type.param) + + valid_domain, subst_domain, _ = get_param_subst_domain( + new_space, guarded_poly.valid_domain, subst_dict) + + valid_domain = valid_domain & subst_domain + valid_domain = valid_domain.project_out(dim_type.param, 0, i_begin_subst_space) + return GuardedPwQPolynomial(poly, valid_domain) + + +def subst_into_to_count_map(space, tcm, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial + new_count_map = {} + for key, value in six.iteritems(tcm.count_map): + if isinstance(value, GuardedPwQPolynomial): + new_count_map[key] = subst_into_guarded_pwqpolynomial( + space, value, subst_dict) + + elif isinstance(value, isl.PwQPolynomial): + new_count_map[key] = subst_into_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, int): + new_count_map[key] = value + + else: + raise ValueError("unexpected value type") + + return tcm.copy(space=space, count_map=new_count_map) + +# }}} + + def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." 
+ " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + result = "" for key in sorted(m.keys(), key=lambda k: str(k)): result += ("%s : %s\n" % (key, m[key])) return result -class CountGranularity: +# {{{ CountGranularity + +class CountGranularity(object): """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -485,10 +608,12 @@ class CountGranularity: WORKGROUP = "workgroup" ALL = [WORKITEM, SUBGROUP, WORKGROUP] +# }}} + # {{{ Op descriptor -class Op(Record): +class Op(ImmutableRecord): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -514,34 +639,41 @@ class Op(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ - def __init__(self, dtype=None, name=None, count_granularity=None): + def __init__(self, dtype=None, name=None, count_granularity=None, + kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. 
count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity) - else: + + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity) + dtype = to_loopy_type(dtype) - def __hash__(self): - return hash(repr(self)) + super(Op, self).__init__(dtype=dtype, name=name, + count_granularity=count_granularity, + kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) + if self.kernel_name is not None: + return "Op(%s, %s, %s, %s)" % ( + self.dtype, self.name, self.count_granularity, self.kernel_name) + else: + return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} # {{{ MemAccess descriptor -class MemAccess(Record): +class MemAccess(ImmutableRecord): """A descriptor for a type of memory access. .. attribute:: mtype @@ -600,38 +732,38 @@ class MemAccess(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. """ def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, direction=None, variable=None, variable_tag=None, - count_granularity=None): + count_granularity=None, kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. 
count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, - gid_strides=gid_strides, direction=direction, - variable=variable, variable_tag=variable_tag, - count_granularity=count_granularity) - else: + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tag=variable_tag, - count_granularity=count_granularity) + dtype = to_loopy_type(dtype) + + super(MemAccess, self).__init__(mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, + direction=direction, variable=variable, + variable_tag=variable_tag, + count_granularity=count_granularity, + kernel_name=kernel_name) def __hash__(self): - # Note that this means lid_strides and gid_strides must be sorted - # in self.__repr__() + # dicts in gid_strides and lid_strides aren't natively hashable return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -641,33 +773,101 @@ class MemAccess(Record): self.direction, self.variable, self.variable_tag, - self.count_granularity) + self.count_granularity, + self.kernel_name) + +# }}} + + +# {{{ Sync descriptor + +class Sync(ImmutableRecord): + """A descriptor for a type of synchronization. + + .. attribute:: kind + + A string describing the synchronization kind, e.g. ``"barrier_global"`` or + ``"barrier_local"`` or ``"kernel_launch"``. + + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
+ """ + + def __init__(self, kind=None, kernel_name=None): + super(Sync, self).__init__(kind=kind, kernel_name=kernel_name) + + def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness + return "Sync(%s, %s)" % (self.kind, self.kernel_name) # }}} -# {{{ counter base +# {{{ CounterBase class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, callables_table, kernel_rec): self.knl = knl + self.callables_table = callables_table + self.kernel_rec = kernel_rec + from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, callables_table) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 + + @property + @memoize_method + def param_space(self): + return get_kernel_parameter_space(self.knl) + + def new_poly_map(self, count_map): + return ToCountPolynomialMap(self.param_space, count_map) + + def new_zero_poly_map(self): + return self.new_poly_map({}) def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() def map_call(self, expr): - return self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + from loopy.kernel.data import ValueArg + if isinstance(clbl, CallableKernel): + sub_result = self.kernel_rec(clbl.subkernel) + + arg_dict = dict( + (arg.name, value) + for arg, value in zip( + clbl.subkernel.args, + expr.parameters) + if isinstance(arg, ValueArg)) + + return subst_into_to_count_map( + self.param_space, + sub_result, arg_dict) \ + + self.rec(expr.parameters) + + else: + raise NotImplementedError() + + def map_call_with_kwargs(self, expr): + # FIXME + raise NotImplementedError() def map_sum(self, expr): if expr.children: return 
sum(self.rec(child) for child in expr.children) else: - return ToCountMap() + return self.new_zero_poly_map() map_product = map_sum @@ -696,8 +896,8 @@ class CounterBase(CombineMapper): map_derivative = map_common_subexpression map_slice = map_common_subexpression - # preprocessing should have removed these def map_reduction(self, expr): + # preprocessing should have removed these raise RuntimeError("%s encountered %s--not supposed to happen" % (type(self).__name__, type(expr).__name__)) @@ -707,60 +907,81 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, count_within_subscripts=True): - self.knl = knl + def __init__(self, knl, callables_table, kernel_rec, + count_within_subscripts=True): + super(ExpressionOpCounter, self).__init__( + knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() map_tagged_variable = map_constant map_variable = map_constant def map_call(self, expr): - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.new_poly_map( + {Op(dtype=self.type_inf(expr), + name='func:'+clbl.name, + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.parameters) + else: + return super(ExpressionOpCounter, self).map_call(expr) 
def map_subscript(self, expr): if self.count_within_subscripts: return self.rec(expr.index) else: - return ToCountMap() + return self.new_zero_poly_map() + + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() def map_sum(self, expr): assert expr.children - return ToCountMap( + return self.new_poly_map( {Op(dtype=self.type_inf(expr), name='add', - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), + return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.SUBGROUP): 1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): self.one}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), + self.new_poly_map({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.SUBGROUP): -1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='div', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -768,32 +989,36 @@ class ExpressionOpCounter(CounterBase): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='pow', - 
count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='shift', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)}) \ + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -814,9 +1039,10 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity=CountGranularity.SUBGROUP): + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -857,6 +1083,8 @@ class _IndexStrideCoefficientCollector(CoefficientCollector): # }}} +# {{{ _get_lid_and_gid_strides + def _get_lid_and_gid_strides(knl, array, index): # find all local and global 
index tags and corresponding inames from loopy.symbolic import get_dependencies @@ -925,28 +1153,49 @@ def _get_lid_and_gid_strides(knl, array, index): return get_iname_strides(lid_to_iname), get_iname_strides(gid_to_iname) +# }}} + -class MemAccessCounter(CounterBase): - pass +# {{{ MemAccessCounterBase + +class MemAccessCounterBase(CounterBase): + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() + + def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.rec(expr.parameters) + else: + return super(MemAccessCounterBase, self).map_call(expr) + +# }}} # {{{ LocalMemAccessCounter -class LocalMemAccessCounter(MemAccessCounter): +class LocalMemAccessCounter(MemAccessCounterBase): + local_mem_count_granularity = CountGranularity.SUBGROUP + def count_var_access(self, dtype, name, index): - sub_map = ToCountMap() + count_map = {} if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.address_space == AddressSpace.LOCAL): if index is None: # no subscript - sub_map[MemAccess( + count_map[MemAccess( mtype='local', dtype=dtype, - count_granularity=CountGranularity.SUBGROUP) - ] = 1 - return sub_map + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one + return self.new_poly_map(count_map) array = self.knl.temporary_variables[name] @@ -958,15 +1207,16 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - sub_map[MemAccess( + count_map[MemAccess( mtype='local', dtype=dtype, lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), 
variable=name, - count_granularity=CountGranularity.SUBGROUP)] = 1 + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one - return sub_map + return self.new_poly_map(count_map) def map_variable(self, expr): return self.count_var_access( @@ -985,7 +1235,7 @@ class LocalMemAccessCounter(MemAccessCounter): # {{{ GlobalMemAccessCounter -class GlobalMemAccessCounter(MemAccessCounter): +class GlobalMemAccessCounter(MemAccessCounterBase): def map_variable(self, expr): name = expr.name @@ -993,17 +1243,18 @@ class GlobalMemAccessCounter(MemAccessCounter): array = self.knl.arg_dict[name] else: # this is a temporary variable - return ToCountMap() + return self.new_zero_poly_map() if not isinstance(array, lp.ArrayArg): # this array is not in global memory - return ToCountMap() + return self.new_zero_poly_map() - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), lid_strides={}, - gid_strides={}, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + return self.new_poly_map({MemAccess(mtype='global', + dtype=self.type_inf(expr), lid_strides={}, + gid_strides={}, variable=name, + count_granularity=CountGranularity.WORKITEM, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.index) def map_subscript(self, expr): name = expr.aggregate.name @@ -1029,19 +1280,27 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - count_granularity = CountGranularity.WORKITEM if ( - 0 in lid_strides and lid_strides[0] != 0 - ) else CountGranularity.SUBGROUP + global_access_count_granularity = CountGranularity.SUBGROUP - return ToCountMap({MemAccess( + # Account for broadcasts once per subgroup + count_granularity = CountGranularity.WORKITEM if ( + # if the stride in lid.0 is known + 0 in lid_strides + and + # it is nonzero + lid_strides[0] != 0 + ) else global_access_count_granularity + + return 
self.new_poly_map({MemAccess( mtype='global', dtype=self.type_inf(expr), lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, variable_tag=var_tag, - count_granularity=count_granularity - ): 1} + count_granularity=count_granularity, + kernel_name=self.knl.name, + ): self.one} ) + self.rec(expr.index_tuple) # }}} @@ -1117,7 +1376,9 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count def add_assumptions_guard(kernel, pwqpolynomial): - return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + return GuardedPwQPolynomial( + pwqpolynomial, + kernel.assumptions.align_params(pwqpolynomial.space)) def count(kernel, set, space=None): @@ -1219,9 +1480,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): +def get_unused_hw_axes_factor(knl, callables_table, insn, + disregard_local_axes): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1238,12 +1500,12 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): g_used.add(tag.axis) def mult_grid_factor(used_axes, size): - result = 1 + result = get_kernel_zero_pwqpolynomial(knl) + 1 + for iaxis, size in enumerate(size): if iaxis not in used_axes: if not isinstance(size, int): - if space is not None: - size = size.align_params(space) + size = size.align_params(result.space) size = isl.PwQPolynomial.from_pw_aff(size) @@ -1259,37 +1521,40 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return add_assumptions_guard(knl, result) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_inames_domain(knl, inames): + space = get_kernel_parameter_space(knl) + if not inames: + return get_kernel_zero_pwqpolynomial(knl) + 1 + + 
inames_domain = knl.get_inames_domain(inames) + domain = inames_domain.project_out_except(inames, [dim_type.set]) + return count(knl, domain, space=space) + + +def count_insn_runs(knl, callables_table, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = knl.insn_inames(insn) if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) - - space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, - set=[], params=knl.outer_params()) - - c = count(knl, domain, space=space) + c = count_inames_domain(knl, insn_inames) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) + unused_fac = get_unused_hw_axes_factor(knl, callables_table, + insn, disregard_local_axes=disregard_local_axes) return c * unused_fac else: return c @memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1301,19 +1566,21 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, callables_table, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, 
count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: - # get the group size + # {{{ compute workgroup_size + from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1324,15 +1591,18 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, % (CountGranularity.SUBGROUP, local_size)) workgroup_size *= s + # }}} + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for work-group size " "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" + "multiple device programs present, actual sub-group count may be " "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: # this should not happen since this is enforced in Op/MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" @@ -1344,17 +1614,55 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, +def _get_op_map_for_single_kernel(knl, callables_table, + count_redundant_work, + count_within_subscripts, subgroup_size): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." 
% knl.name) + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_op_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) + + op_counter = ExpressionOpCounter(knl, callables_table, kernel_rec, + count_within_subscripts) + op_map = op_counter.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignees) + op_counter(insn.expression) + for key, val in six.iteritems(ops.count_map): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + op_map = op_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1372,7 +1680,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, specifies that it should only be counted once per sub-group. 
If set to *None* an attempt to find the sub-group size using the device will be made, if this fails an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``'guess'`` is passed as the subgroup_size, :func:`get_op_map` will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1405,57 +1713,32 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - - subgroup_size = _process_subgroup_size(knl, subgroup_size) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + if isinstance(program, LoopKernel): + program = make_program(program) - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, count_within_subscripts) - - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = preprocess_program(program) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - for key, val in six.iteritems(ops.count_map): - op_map = ( - op_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): - pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - if numpy_types: - return ToCountMap( - init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return _get_op_map_for_single_kernel( + program[program.name], program.callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) # }}} +# {{{ subgoup size finding + def _find_subgroup_size_for_knl(knl): from loopy.target.pyopencl import PyOpenCLTarget if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: @@ -1507,20 +1790,70 @@ def _process_subgroup_size(knl, subgroup_size_requested): "must be integer, 'guess', or, if you're feeling " "lucky, None." % (subgroup_size_requested)) +# }}} + # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, +def _get_mem_access_map_for_single_kernel(knl, callables_table, + count_redundant_work, subgroup_size): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." 
% knl.name) + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_mem_access_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + + access_counter_g = GlobalMemAccessCounter( + knl, callables_table, kernel_rec) + access_counter_l = LocalMemAccessCounter( + knl, callables_table, kernel_rec) + access_map = access_counter_g.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + insn_access_map = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + for assignee in insn.assignees: + insn_access_map = insn_access_map + ( + access_counter_g(insn.assignee) + + access_counter_l(insn.assignee) + ).with_set_attributes(direction="store") + + for key, val in six.iteritems(insn_access_map.count_map): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + access_map = access_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return access_map + + +def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, subgroup_size=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. 
This :class:`bool` flag indicates whether this work should be included in the count. @@ -1596,87 +1929,82 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # (now use these counts to, e.g., predict performance) """ + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) + + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) + + return _get_mem_access_map_for_single_kernel( + program[program.name], program.callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + +# }}} + + +# {{{ get_synchronization_map + +def _get_synchronization_map_for_single_kernel(knl, callables_table, + subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." 
% knl.name) - subgroup_size = _process_subgroup_size(knl, subgroup_size) + knl = lp.get_one_scheduled_kernel(knl, callables_table) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + CallKernel, ReturnFromKernel, RunInstruction) - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + kernel_rec = partial(_get_synchronization_map_for_single_kernel, + callables_table=callables_table, + subgroup_size=subgroup_size) - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + sync_counter = CounterBase(knl, callables_table, kernel_rec) + sync_map = sync_counter.new_zero_poly_map() - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - access_expr = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") - - access_assignee = ( - access_counter_g(insn.assignee) - + access_counter_l(insn.assignee) - ).with_set_attributes(direction="store") - - for key, val in six.iteritems(access_expr.count_map): - - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) - - for key, val in six.iteritems(access_assignee.count_map): - - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + iname_list = [] - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + for sched_item in knl.schedule: + if isinstance(sched_item, EnterLoop): + if sched_item.iname: # (if not empty) + iname_list.append(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + if sched_item.iname: # 
(if not empty) + iname_list.pop() + + elif isinstance(sched_item, Barrier): + sync_map = sync_map + ToCountMap( + {Sync( + "barrier_%s" % sched_item.synchronization_kind, + knl.name): count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, RunInstruction): pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) - if numpy_types: - return ToCountMap( - init_dict=dict( - (MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tag=mem_access.variable_tag, - count_granularity=mem_access.count_granularity), - ct) - for mem_access, ct in six.iteritems(access_map.count_map)), - val_type=access_map.val_type - ) - else: - return access_map + elif isinstance(sched_item, CallKernel): + sync_map = sync_map + ToCountMap( + {Sync("kernel_launch", knl.name): + count_inames_domain(knl, frozenset(iname_list))}) -# }}} + elif isinstance(sched_item, ReturnFromKernel): + pass + else: + raise LoopyError("unexpected schedule item: %s" + % type(sched_item).__name__) -# {{{ get_synchronization_map + return sync_map -def get_synchronization_map(knl, subgroup_size=None): +def get_synchronization_map(program, subgroup_size=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -1713,82 +2041,23 @@ def get_synchronization_map(knl, subgroup_size=None): """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." 
% knl.name) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, - CallKernel, ReturnFromKernel, RunInstruction) - from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - iname_list = [] - - result = ToCountMap() - - one = isl.PwQPolynomial('{ 1 }') - - def get_count_poly(iname_list): - if iname_list: # (if iname_list is not empty) - ct = (count(knl, ( - knl.get_inames_domain(iname_list). - project_out_except(iname_list, [dim_type.set]) - )), ) - return reduce(mul, ct) - else: - return one - - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - if sched_item.iname: # (if not empty) - iname_list.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - if sched_item.iname: # (if not empty) - iname_list.pop() - - elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: - get_count_poly(iname_list)}) + from loopy.preprocess import preprocess_program, infer_unknown_types - elif isinstance(sched_item, CallKernel): - result = result + ToCountMap( - {"kernel_launch": get_count_poly(iname_list)}) - - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): - pass - - else: - raise LoopyError("unexpected schedule item: %s" - % type(sched_item).__name__) + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) - return result + return _get_synchronization_map_for_single_kernel( + program[program.name], program.callables_table, + subgroup_size=subgroup_size) # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` to - :class:`islpy.Set` instances capturing which indices of each the array - *var_name* are read/written (where *direction* is either ``read`` or - ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for accesses - on which the footprint cannot be determined (e.g. data-dependent or - nonlinear indices) - """ - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - +def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): write_footprints = [] read_footprints = [] @@ -1811,6 +2080,40 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False): + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. + + :arg ignore_uncountable: If *False*, an error will be raised for accesses + on which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) + """ + + # FIMXE: works only for one callable kernel till now. 
+ if len([in_knl_callable for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) + + write_footprints = [] + read_footprints = [] + + write_footprints, read_footprints = _gather_access_footprints_for_single_kernel( + program[program.name], ignore_uncountable) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1825,7 +2128,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1836,12 +2139,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, @@ -1864,74 +2167,4 @@ def gather_access_footprint_bytes(kernel, 
ignore_uncountable=False): # }}} -# {{{ compat goop - -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "deprecated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "deprecated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_synchronization_poly(knl): - """Count the number of synchronization events each work-item encounters in - a loopy kernel. - - get_synchronization_poly is deprecated. Use get_synchronization_map - instead. - - """ - warn_with_kernel(knl, "deprecated_get_synchronization_poly", - "get_synchronization_poly is deprecated. 
Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) - - -def get_op_poly(knl, numpy_types=True): - """Count the number of operations in a loopy kernel. - - get_op_poly is deprecated. Use get_op_map instead. - - """ - warn_with_kernel(knl, "deprecated_get_op_poly", - "get_op_poly is deprecated. Use get_op_map instead.") - return get_op_map(knl, numpy_types) - -# }}} - # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ccac5e199d2b53e202dd735ffd8dfe20a7dc29a2..b21b9937c680529a39d825d96822f780cf0f7f99 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase - +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl @@ -108,7 +108,14 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args, **kwargs): - return type(expr)(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + + def map_sub_array_ref(self, expr, *args, **kwargs): + return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), + self.rec(expr.subscript, *args, **kwargs)) + + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -167,15 +174,32 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_sub_array_ref(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant 
class CombineMapper(CombineMapperBase): def map_reduction(self, expr, *args, **kwargs): return self.rec(expr.expr, *args, **kwargs) + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + map_linear_subscript = CombineMapperBase.map_subscript @@ -234,6 +258,15 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + return expr.name + + def map_sub_array_ref(self, expr, prec): + return "[{inames}]: {subscr}".format( + inames=','.join(self.rec(iname, prec) for iname in + expr.swept_inames), + subscr=self.rec(expr.subscript, prec)) + class EqualityPreservingStringifyMapper(StringifyMapperBase): """ @@ -273,7 +306,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -309,8 +342,16 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args, **kwargs) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. 
+ return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) + return deps - set(p.Variable(iname) for iname in expr.inames) def map_tagged_variable(self, expr, *args, **kwargs): @@ -319,11 +360,18 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() + def map_sub_array_ref(self, expr, *args): + deps = self.rec(expr.subscript, *args) + return deps - set(iname for iname in expr.swept_inames) + map_linear_subscript = DependencyMapperBase.map_subscript def map_type_cast(self, expr, *args, **kwargs): return self.rec(expr.child, *args, **kwargs) + def map_resolved_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -551,7 +599,6 @@ class Reduction(LoopyExpressionBase): across :attr:`inames`. .. attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` .. attribute:: inames @@ -678,6 +725,164 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. 
+ """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ReductionOpFunction + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, ReductionOpFunction): + return self.function + else: + raise LoopyError("Unexpected function type %s in ResolvedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_resolved_function") + + +class EvaluatorWithDeficientContext(PartialEvaluationMapper): + """Evaluation Mapper that does not need values of all the variables + involved in the expression. + + Returns the expression with the values mapped from :attr:`context`. + """ + def map_variable(self, expr): + if expr.name in self.context: + return self.context[expr.name] + else: + return expr + + +class VariableInAnExpression(CombineMapper): + def __init__(self, variables_to_search): + assert(all(isinstance(variable, p.Variable) for variable in + variables_to_search)) + self.variables_to_search = variables_to_search + + def combine(self, values): + return any(values) + + def map_variable(self, expr): + return expr in self.variables_to_search + + def map_constant(self, expr): + return False + + +class SweptInameStrideCollector(CoefficientCollectorBase): + """ + Mapper to compute the coefficient swept inames for :class:`SubArrayRef`. + """ + def map_algebraic_leaf(self, expr): + # subscripts that are not involved in :attr:`target_names` are treated + # as constants. 
+ if isinstance(expr, p.Subscript) and (self.target_names is None + or expr.aggregate.name not in self.target_names): + return {1: expr} + + return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) + + +class SubArrayRef(p.Expression): + """ + An algebraic expression to map an affine memory layout pattern (known as + sub-array) as consecutive elements of the sweeping axes which are defined + using :attr:`SubArrayRef.swept_inames`. + + .. attribute:: swept_inames + + An instance of :class:`tuple` denoting the axes to which the sub array + is supposed to be mapped to. + + .. attribute:: subscript + + An instance of :class:`pymbolic.primitives.Subscript` denoting the + array in the kernel. + """ + + init_arg_names = ("swept_inames", "subscript") + + def __init__(self, swept_inames, subscript): + + # {{{ sanity checks + + if not isinstance(swept_inames, tuple): + assert isinstance(swept_inames, p.Variable) + swept_inames = (swept_inames,) + + assert isinstance(swept_inames, tuple) + + for iname in swept_inames: + assert isinstance(iname, p.Variable) + assert isinstance(subscript, p.Subscript) + + # }}} + + self.swept_inames = swept_inames + self.subscript = subscript + + def get_begin_subscript(self): + """ + Returns an instance of :class:`pymbolic.primitives.Subscript`, the + beginning subscript of the array swept by the *SubArrayRef*. + + **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning + subscript would be ``a[0, j, 0, l]`` + """ + # TODO: Set the zero to the minimum value of the iname. 
+ swept_inames_to_zeros = dict( + (swept_iname.name, 0) for swept_iname in self.swept_inames) + + return EvaluatorWithDeficientContext(swept_inames_to_zeros)( + self.subscript) + + def __getinitargs__(self): + return (self.swept_inames, self.subscript) + + def get_hash(self): + return hash((self.__class__, self.swept_inames, self.subscript)) + + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.subscript == self.subscript + and other.swept_inames == self.swept_inames) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_sub_array_ref") + # }}} @@ -690,9 +895,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -890,12 +1098,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -951,7 +1161,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, 
kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -960,7 +1170,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn @@ -1141,6 +1351,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser @@ -1171,8 +1389,10 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) import loopy as lp + if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1188,6 +1408,22 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + + elif pstate.is_next(_openbracket): + pstate.advance() + pstate.expect_not_end() + if pstate.is_next(_closebracket): + swept_inames = () + else: + swept_inames = self.parse_expression(pstate) + + pstate.expect(_closebracket) + pstate.advance() + pstate.expect(_colon) + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return SubArrayRef(swept_inames, subscript) + else: return super(LoopyParser, self).parse_prefix(pstate) @@ -1440,6 +1676,7 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # {{{ simplify using aff +# FIXME: redundant with simplify_via_aff def 
simplify_using_aff(kernel, expr): inames = get_dependencies(expr) & kernel.all_inames() @@ -1458,15 +1695,34 @@ def simplify_using_aff(kernel, expr): # }}} -# {{{ expression/set <-> constraint conversion +# {{{ qpolynomial_to_expr + +def _term_to_expr(space, term): + from pymbolic.primitives import Variable + + result = term.get_coefficient_val().to_python() + for dt in isl._CHECK_DIM_TYPES: + for i in range(term.dim(dt)): + exp = term.get_exp(dt, i) + if exp: + result = result*Variable(space.get_dim_name(dt, i))**exp -def eq_constraint_from_expr(space, expr): - return isl.Constraint.equality_from_aff(aff_from_expr(space, expr)) + for i in range(term.dim(dim_type.div)): + raise NotImplementedError("divs in terms") + # FIXME print the qpoly, match the semantics + result += aff_to_expr(term.get_div(i)) + return result -def ineq_constraint_from_expr(space, expr): - return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr)) +def qpolynomial_to_expr(qpoly): + space = qpoly.space + return sum(_term_to_expr(space, t) for t in qpoly.get_terms()) + +# }}} + + +# {{{ expression/set <-> constraint conversion def constraint_to_cond_expr(cns): # Looks like this is ok after all--get_aff() performs some magic. 
@@ -1668,7 +1924,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, except ExpressionToAffineConversionError as err: shape_aff = None - if shape is not None: + if shape is not None and shape[idim] is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) except ExpressionToAffineConversionError: @@ -1780,6 +2036,10 @@ class BatchedAccessRangeMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + total_inames = inames | set([iname.name for iname in expr.swept_inames]) + return self.rec(expr.subscript, total_inames) + class AccessRangeMapper(object): """**IMPORTANT** diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 73d2a6328af87cb51fb90d43efcde34d39aa8299..20220d41838783389da955f78773ca18b67d04c6 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -81,7 +81,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, callables_table): pass # }}} @@ -151,7 +151,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_id_in_knl_callable_mapper(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. + """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 01d26dd822e46973ad7ebb99f18ec4519e0b4585..c8aa041da632b7d6896376761117b416dd56eade 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. 
import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -62,11 +62,13 @@ class DTypeRegistryWrapper(object): return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" @@ -393,76 +395,131 @@ def c_symbol_mangler(kernel, name): # float NAN as defined in C99 standard if name == "NAN": return NumpyType(np.dtype(np.float32)), name + + if name in ["INT_MAX", "INT_MIN"]: + return NumpyType(np.dtype(np.int32)), name + return None # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. 
cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None - - if name in ["abs", "min", "max"]: - name = "f" + name +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. + """ - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + name = self.name - dtype = arg_dtypes[0].numpy_dtype + if name in ["abs", "min", "max"]: + name = "f" + name - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: # pylint:disable=no-member - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) - # binary functions - if (name in ["fmax", "fmin", "copysign"] - and len(arg_dtypes) == 2): + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: # pylint:disable=no-member - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + + # binary functions + elif name in ["fmax", "fmin", "pow", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + #FIXME: Do we need to raise here?: + # The pattern we generally follow is that if we don't find + # a function, then we just return None + raise LoopyError("%s can take only two arguments." 
% name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: # pylint:disable=no-member + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. 
+ """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc"]: + return CMathCallable(name=identifier) return None # }}} @@ -471,12 +528,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CFamilyASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CFamilyASTBuilder, self).symbol_manglers() + [ @@ -489,6 +540,12 @@ class CFamilyASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_id_in_knl_callable_mapper(self): + return ( + super(CFamilyASTBuilder, + self).function_id_in_knl_callable_mapper() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -575,9 +632,13 @@ class CFamilyASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" + if codegen_state.kernel.is_called_from_host: + name = Value("void", name) + else: + name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( - Value("void", name), + name, [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) @@ -606,8 +667,8 @@ class CFamilyASTBuilder(ASTBuilderBase): temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( - temporaries_read_in_subkernel(kernel, subkernel) | - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( six.itervalues(kernel.temporary_variables), @@ -890,82 +951,33 @@ class CFamilyASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - - from 
pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.callables_table[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) + # takes "is_returned" to infer whether insn.assignees[0] is a part of + # LHS. 
+ in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) - - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 698507978f7c20d6d594fd3e03626e7b12012a94..b6525b5d183c803955317a92ed10d8206eecba65 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, 
implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -379,7 +380,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). 
If the @@ -388,35 +389,35 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -425,14 +426,14 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, + codegen_result.implemented_data_info, all_code, self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, 
implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) # }}} @@ -449,7 +450,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 1bf197c086659da9d5da9d4c7f6b463f04a1544f..c970901b1ce9161c713d83296f4ca7f05b8f433c 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -165,6 +166,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + return var("&")(self.rec(expr.get_begin_subscript(), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] @@ -437,103 +442,29 @@ class 
ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function - - # {{{ implement indexof, indexof_vec - - if identifier.name in ["indexof", "indexof_vec"]: - if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) - arg, = expr.parameters - if not isinstance(arg, Subscript): - raise LoopyError( - "argument to %s must be a subscript" % identifier.name) - - ary = self.find_array(arg) - - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) - - from loopy.kernel.data import ImageArg - if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) - - if identifier.name == "indexof": - return access_info.subscripts[0] - elif identifier.name == "indexof_vec": - from loopy.kernel.array import VectorArrayDimTag - ivec = None - for iaxis, dim_tag in enumerate(ary.dim_tags): - if isinstance(dim_tag, VectorArrayDimTag): - ivec = iaxis - - if ivec is None: - return access_info.subscripts[0] - else: - return ( - access_info.subscripts[0]*ary.shape[ivec] - + access_info.vector_index) - - else: - raise RuntimeError("should not get here") - # }}} + identifier_name = ( + self.codegen_state.callables_table[expr.function.name].name) - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.codegen_state.callables_table[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = ( + self.codegen_state.callables_table[ + expr.function.name]) + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return ( + self.codegen_state.callables_table[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 
50fd1026f7bd15ce72915d0d5d5e60f6da4e264c..d713e06c08b6a16962043103e9bde440011e5359 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, caller_kernel, + callables_table): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), + 0: dtype, 1: dtype}), + callables_table) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -181,6 +235,9 @@ class CudaTarget(CFamilyTarget): super(CudaTarget, self).__init__() + def split_kernel_at_global_barriers(self): + return True + def get_device_ast_builder(self): return CUDACASTBuilder(self) @@ -214,16 +271,49 @@ class 
CudaTarget(CFamilyTarget): # }}} +# {{{ preamable generator + +def cuda_preamble_generator(preamble_info): + from loopy.types import AtomicNumpyType + seen_64_bit_atomics = any( + isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8 + for dtype in preamble_info.seen_atomic_dtypes) + + if seen_64_bit_atomics: + # Source: + # docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + yield ("00_enable_64bit_atomics", """ + #if __CUDA_ARCH__ < 600 + __device__ double atomicAdd(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); + } + #endif + """) + +# }}} + + # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) + def function_id_in_knl_callable_mapper(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) # }}} @@ -249,7 +339,8 @@ class CUDACASTBuilder(CFamilyASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -262,6 +353,12 @@ class CUDACASTBuilder(CFamilyASTBuilder): return FunctionDeclarationWrapper(fdecl) + def preamble_generators(self): + + return ( + super(CUDACASTBuilder, self).preamble_generators() + [ + cuda_preamble_generator]) + # }}} # {{{ code generation guts @@ -339,6 +436,97 @@ class CUDACASTBuilder(CFamilyASTBuilder): 
return CudaConstant(arg_decl) + # {{{ code generation for atomic update + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum + from cgen import Statement + from pymbolic.mapper.stringifier import PREC_NONE + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ + np.int32, np.int64, np.float32, np.float64]: + # atomicAdd + if isinstance(rhs_expr, Sum): + ecm = self.get_expression_to_code_mapper(codegen_state) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Statement("atomicAdd(&{0}, {1})".format( + lhs_expr_code, rhs_expr_code)) + else: + from cgen import Block, DoWhile, Assign + from loopy.target.c import POD + old_val_var = codegen_state.var_name_generator("loopy_old_val") + new_val_var = codegen_state.var_name_generator("loopy_new_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + }) + + lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) + + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + from loopy.symbolic import SubstitutionMapper + + subst = SubstitutionMapper( + make_subst_func({lhs_expr: var(old_val_var)})) + rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + cast_str = "" + old_val = old_val_var + new_val = new_val_var + + if lhs_dtype.numpy_dtype.kind == "f": + if lhs_dtype.numpy_dtype == np.float32: + ctype = "int" + elif lhs_dtype.numpy_dtype == np.float64: + ctype = "long" + else: + assert False + + old_val = "*(%s *) &" % ctype + old_val + new_val = "*(%s *) &" % ctype + new_val + cast_str = "(%s *) " % (ctype) + 
+ return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), + DoWhile( + "atomicCAS(" + "%(cast_str)s&(%(lhs_expr)s), " + "%(old_val)s, " + "%(new_val)s" + ") != %(old_val)s" + % { + "cast_str": cast_str, + "lhs_expr": lhs_expr_code, + "old_val": old_val, + "new_val": new_val, + }, + Block([ + Assign(old_val_var, lhs_expr_code), + Assign(new_val_var, rhs_expr_code), + ]) + ) + ]) + else: + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + + # }}} + # }}} # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index c8f0d40903b1638e853caf459c0c0393d754c993..9d1d14376e5adc6592bfee29f738c4e37f6a39f6 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,12 @@ class SeparateArrayPackingController(object): It also repacks outgoing arrays of this type back into an object array. """ - def __init__(self, kernel): + def __init__(self, program): # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program.args: if not isinstance(arg, ArrayBase): continue @@ -82,7 +82,8 @@ class SeparateArrayPackingController(object): name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program.root_kernel.get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -143,7 +144,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -168,7 +169,8 @@ class ExecutionWrapperGeneratorBase(object): if 
len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -214,9 +216,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, kernel, + def generate_integer_arg_finding_from_offsets(self, gen, program, implemented_data_info): - options = kernel.options + options = program.root_kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -239,7 +241,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -264,8 +266,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + self, gen, program, implemented_data_info): + options = program.root_kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -284,7 +286,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -307,8 +309,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: + self, gen, program, implemented_data_info): + if program.root_kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -361,7 +363,7 @@ 
class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, implemented_data_info, options): + self, gen, program, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -384,8 +386,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in program.root_kernel.get_written_variables() + program_arg = program.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -447,7 +449,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, program_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -465,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -493,10 +495,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if kernel_arg.shape is None: + if program_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in program_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -519,8 +521,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + if 
arg.unvec_strides and program_arg.dim_tags: + itemsize = program_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) @@ -558,7 +560,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." + "default_offset=loopy.auto to make_program()." "\")" % arg.name) gen("") @@ -617,7 +619,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -629,12 +631,12 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = kernel.options + options = program.root_kernel.options implemented_data_info = codegen_result.implemented_data_info from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % program.name, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -651,21 +653,21 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program, implemented_data_info, options) self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, 
implemented_data_info) + program, implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program, implemented_data_info) if options.write_wrapper: output = gen.get() @@ -713,32 +715,32 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program.args + if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - kernel = self.kernel + program = self.program if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = kernel.impl_arg_to_arg[var].name + dest_name = program.impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -749,28 +751,30 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - kernel = add_dtypes(kernel, var_to_dtype) + program = add_dtypes(program, var_to_dtype) - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.root_kernel.schedule is None: + from loopy.preprocess 
import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + program = program.with_root_kernel( + get_one_scheduled_kernel(program.root_kernel, + program.callables_table)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. - cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -778,9 +782,9 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -791,7 +795,7 @@ class KernelExecutorBase(object): if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) @@ -823,7 +827,7 @@ class KernelExecutorBase(object): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -831,7 +835,7 @@ class KernelExecutorBase(object): arg_to_dtype = 
frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index eb0157bf86d478901fb5a07bbac28aa7a11bcec9..812bf3a560b191bd61f5b86cb401983ad97467a2 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,8 +172,9 @@ class ISPCTarget(CFamilyTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_check(self, kernel, callables_table): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 04d436043daed74362ebabd96e18bf1d4d6d4a6c..6b7ef1b886d0620ef4fdbb9ccb1d208bba43f14f 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,134 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + 
:class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}), + callables_table) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return ( + self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. 
+ return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) + + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) return None @@ -280,6 +355,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -365,13 +441,10 @@ class OpenCLTarget(CFamilyTarget): class OpenCLCASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): + def function_id_in_knl_callable_mapper(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( @@ -380,13 +453,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -399,6 +469,11 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.kernel.is_called_from_host: + # auxiliary kernels need not mention opencl specific qualifiers + # for a function's signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize @@ -407,7 +482,8 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + 
codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 826ba2a8f09b8a19d19200ef6d936a8276cf3688..c11c309614a78486635e330ed00aac46abc123fb 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -135,7 +135,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -152,7 +152,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -200,37 +201,89 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class 
PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + callables_table) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. 
+ if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + +def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -364,8 +417,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -762,19 +815,17 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from 
loopy.library.random123 import random123_function_mangler + def function_id_in_knl_callable_mapper(self): + from loopy.library.random123 import ( + random123_function_id_to_in_knl_callable_mapper) return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_id_to_in_knl_callable_mapper, + random123_function_id_to_in_knl_callable_mapper] + super( + PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} @@ -782,6 +833,70 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # }}} +class NvidiaPyOpenCLTarget(PyOpenCLTarget): + def __init__(self, device, pyopencl_module_name="_lpy_cl", + atomics_flavor=None): + import pyopencl as cl + assert isinstance(device, cl.Device) + assert device.vendor == 'NVIDIA Corporation' + + super(NvidiaPyOpenCLTarget, self).__init__(device, + pyopencl_module_name, atomics_flavor) + + def preprocess(self, kernel): + from loopy import set_options + if self.device.compute_capability_major_nv >= 6: + build_options = ['-cl-nv-arch', 'sm_60'] + ( + kernel.options.cl_build_options) + kernel = set_options(kernel, cl_build_options=build_options) + return super(NvidiaPyOpenCLTarget, self).preprocess(kernel) + + def get_device_ast_builder(self): + # here we should have an if else condition + if self.device.compute_capability_major_nv >= 6: + return NvidiaPyOpenCLCASTBuilder(self) + else: + return super(NvidiaPyOpenCLTarget, self).get_device_ast_builder() + + +class NvidiaPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum + from cgen import Statement, Block, Assign + from 
loopy.target.c import POD + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype == np.float64: + # atomicAdd + if isinstance(rhs_expr, Sum): + + old_val_var = codegen_state.var_name_generator("loopy_old_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + }) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + Assign(old_val_var, lhs_expr_code), + Statement('asm volatile("atom.global.add.f64 %0, [%1], %2;" :' + '"=d"({0}) : "l"(&{1}) , "d"({2}))'.format( + old_val_var, lhs_expr_code, rhs_expr_code))]) + + return super(NvidiaPyOpenCLCASTBuilder, + self).emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + + # {{{ volatile mem acccess target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 05fdd21f084412997ed1f0b258f6ba70b435bd4e..b7006575bb05561e29320f092935e7bb5dcab006 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): - if kernel.options.cl_exec_manage_array_events: + def generate_invocation(self, gen, program_name, args, + program, implemented_data_info): + if program.root_kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") - gen("_lpy_evt = {kernel_name}({args})" + gen("_lpy_evt = 
{program_name}({args})" .format( - kernel_name=kernel_name, + program_name=program_name, args=", ".join( ["_lpy_cl_kernels", "queue"] + args + ["wait_for=wait_for", "allocator=allocator"]))) - if kernel.options.cl_exec_manage_array_events: + if program.root_kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + and arg.base_name in ( + program.root_kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue - is_written = arg.base_name in kernel.get_written_variables() + is_written = arg.base_name in ( + program.root_kernel.get_written_variables()) if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -218,12 +220,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -252,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. 
automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -261,41 +264,41 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__(program) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=( - kernel.target.with_device(context.devices[0]))) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=( + program.target.with_device(context.devices[0]))) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = dev_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if 
self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -303,17 +306,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program.root_kernel.options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -348,10 +351,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3d2a39dcf7126339055d32fa16ffcc25..1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -82,47 +83,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from 
pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.callables_table[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.codegen_state.callables_table[ + expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +180,12 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_id_in_knl_callable_mapper(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, + self).function_id_in_knl_callable_mapper() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/tools.py b/loopy/tools.py index e16bac6b28d4800a6f15072885cf7366e4e40836..524638e4a9fb2c624c49a3da53fd3cbdeca907c8 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -38,7 +38,9 @@ from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) import six # noqa from six.moves import intern - +import re +from mako.template import Template +import loopy as lp if six.PY2: def is_integer(obj): @@ -48,6 +50,17 @@ else: return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + 
:class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. + """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -78,11 +91,17 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key)): + for dict_key in sorted(six.iterkeys(key), key=lambda obj: + type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) @@ -673,4 +692,123 @@ def is_interned(s): def intern_frozenset_of_ids(fs): return frozenset(intern(s) for s in fs) + +def natorder(key): + # Return natural ordering for strings, as opposed to dictionary order. + # E.g. will result in + # 'abc1' < 'abc9' < 'abc10' + # rather than + # 'abc1' < 'abc10' < 'abc9' + # Based on + # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 + import re + return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] + + +def natsorted(seq, key=lambda x: x): + return sorted(seq, key=lambda y: natorder(key(y))) + + +def dump_as_python(kernel, filename=None): + """ + Generates a python code for generating *kernel* for sharing kernels. + + :arg kernel: An instance of :class:`loopy.LoopKernel` + :arg filename: An instance of :class:`str`. If *None*, then prints the + python file to *stdout*. 
+ """ + + options = [] + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + for insn in printed_insn_order: + option = 'id=%s, ' % insn.id + if insn.depends_on: + option += ("dep="+":".join(insn.depends_on)+", ") + if insn.tags: + option += ("tags="+":".join(insn.tags)+", ") + if insn.within_inames: + option += ("inames="+":".join(insn.within_inames)+", ") + if isinstance(insn, lp.MultiAssignmentBase): + if insn.atomicity: + option += "atomic, " + elif isinstance(insn, lp.BarrierInstruction): + option += ("mem_kind=%s, " % insn.mem_kind) + options.append(option[:-2]) + + insn_x_options = zip(printed_insn_order, options) + + python_code = r'''<%! import loopy as lp %>import loopy as lp + import numpy as np + <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', + 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> + knl = lp.make_kernel( + [ + % for dom in domains: + "${str(dom)}", + % endfor + ], + """ + % for insn, opts in insn_x_opts: + % if isinstance(insn, lp.Assignment): + ${insn.assignee} = ${insn.expression} {${opts}} + % elif isinstance(insn, lp.BarrierInstruction): + ... ${insn.synchronization_kind[0]}barrier{${opts}} + % elif isinstance(insn, lp.NoOpInstruction): + ... 
nop {${opts}} + % else: + **Not implemented for ${type(insn)}** + % endif + %endfor + """, [ + % for arg in args: + % if isinstance(arg, lp.ValueArg): + lp.ValueArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), + % else: + lp.GlobalArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, + shape=${arg.shape}, for_atomic=${arg.for_atomic}), + % endif + % endfor + % for tv in temp_vars: + lp.TemporaryVariable( + name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, + shape=${tv.shape}, for_atomic=${tv.for_atomic}, + address_space=${tv_scope[tv.address_space]}, + read_only=${tv.read_only}, + % if tv.initializer is not None: + initializer=${"np."+str((tv.initializer).__repr__())}, + % endif + ), + % endfor + ], lang_version=${lp.VERSION})''' + + python_code = Template(python_code).render(insn_x_opts=insn_x_options, + domains=kernel.domains, args=kernel.args, + temp_vars=[k for k in kernel.temporary_variables.values()]) + + python_code = re.sub("\\n ", "\n", python_code) + if filename: + with open(filename, 'w') as f: + f.write(python_code) + else: + print(python_code) + + # vim: foldmethod=marker diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index a20a798cfa35c64c0cbd7097b41824dda2a35a84..f4a184f632d251bed7ec7d6ace718b3851c5c0d8 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,6 +26,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. 
currentmodule:: loopy @@ -36,8 +38,10 @@ __doc__ = """ # {{{ add_barrier -def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, - tags=None, synchronization_kind="global", mem_kind=None): +@iterate_over_kernels_if_given_program +def add_barrier(knl, insn_before="", insn_after="", + id_based_on=None, tags=None, synchronization_kind="global", + mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to *synchronization_kind* """ + assert isinstance(knl, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind @@ -76,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, mem_kind=mem_kind) new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(kernel=new_knl, + new_knl = add_dependency(new_knl, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index b7f47c38a6a0daf8e4495c16791ef2f955019649..3df86e7ae04073e654f91b30c584719c165269d0 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,9 +27,13 @@ import six from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -53,7 +57,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, 
LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index f6568918d30f33d4c7103e40d02bdc40c38dfa1b..d5a97b773e7447833c96405920e1efad1b382baa 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,14 +25,19 @@ THE SOFTWARE. import six -from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.symbolic import (RuleAwareIdentityMapper, + SubstitutionRuleMappingContext, pw_aff_to_expr) from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. currentmodule:: loopy .. autofunction:: to_batched +.. autofunction:: save_temporaries_in_loop """ @@ -54,13 +59,15 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr, sequential): + batch_iname_expr, sequential, batch_varying_temps=None, within=None): super(_BatchVariableChanger, self).__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr self.sequential = sequential + self.batch_varying_temps = batch_varying_temps + self.within = within def needs_batch_subscript(self, name): tv = self.kernel.temporary_variables.get(name) @@ -70,14 +77,19 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.sequential: if tv is None: return False - if not temp_needs_batching_if_not_sequential(tv, - self.batch_varying_args): - return False + if self.batch_varying_temps: + return tv.name in self.batch_varying_temps + else: + if not temp_needs_batching_if_not_sequential(tv, + self.batch_varying_args): + return False return True def map_subscript(self, expr, expn_state): - if not 
self.needs_batch_subscript(expr.aggregate.name): + if not self.needs_batch_subscript(expr.aggregate.name) or not ( + self.within(expn_state.kernel, expn_state.instruction, + expn_state.stack)): return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) idx = self.rec(expr.index, expn_state) @@ -87,7 +99,9 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) def map_variable(self, expr, expn_state): - if not self.needs_batch_subscript(expr.name): + if not self.needs_batch_subscript(expr.name) or not ( + self.within(expn_state.kernel, expn_state.instruction, + expn_state.stack)): return super(_BatchVariableChanger, self).map_variable(expr, expn_state) return expr[self.batch_iname_expr] @@ -102,8 +116,9 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", - sequential=False): +@iterate_over_kernels_if_given_program +def to_batched(knl, nbatches, batch_varying_args, + batch_iname_prefix="ibatch", sequential=False, within=None): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. 
@@ -180,22 +195,84 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", from loopy.kernel.data import ForceSequentialTag knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) + from loopy.match import parse_stack_match, parse_match + rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, knl, batch_varying_args, batch_iname_expr, - sequential=sequential) + sequential=sequential, within=parse_stack_match(within)) kernel = rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) batch_iname_set = frozenset([batch_iname]) + within = parse_match(within) kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) - for insn in kernel.instructions]) + if within(kernel, insn) else insn for insn in kernel.instructions]) return kernel # }}} + +@iterate_over_kernels_if_given_program +def save_temporaries_in_loop(knl, iname, temps_to_save, within=None): + """ + Returns a kernel with the temporary variables in *temps_to_save* batched + within the iname *iname*. + + :arg iname: An instance of :class:`str1 for the loop across which the + values of the temporaries are to be saved. + + :arg temps_to_save: An iterable containing the temporaries that are to be + saved for each loop iteration defined by *iname*. + + :arg within: If not None, limit the action of the transformation to + matching contexts. See :func:`loopy.match.parse_stack_match` + for syntax. 
+ """ + from loopy.match import parse_match, parse_stack_match + from pymbolic import var + from loopy.isl_helpers import static_max_of_pw_aff + + batch_iname_expr = var(iname) + + bounds = knl.get_iname_bounds(iname, constants_only=False) + nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, + constants_only=False)) + + new_temps = {} + + for temp in six.itervalues(knl.temporary_variables): + if temp.name in temps_to_save: + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(temp.shape) + 1), + dim_names=_add_unique_dim_name("itemp_save", temp.dim_names)) + else: + new_temps[temp.name] = temp + + knl = knl.copy(temporary_variables=new_temps) + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator) + bvc = _BatchVariableChanger(rule_mapping_context, + knl, [], batch_iname_expr, + sequential=False, batch_varying_temps=temps_to_save, + within=parse_stack_match(within)) + kernel = rule_mapping_context.finish_kernel( + bvc.map_kernel(knl)) + + within = parse_match(within) + + batch_iname_set = frozenset([iname]) + kernel = kernel.copy( + instructions=[ + insn.copy(within_inames=insn.within_inames | batch_iname_set) + if within(kernel, insn) else insn for insn in kernel.instructions]) + + return kernel + # vim: foldmethod=marker diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 7f4779cc7c0af3fa228ca51a3f8d45944ec21bff..a1c90d791a9d4097398610badc421aa4600e2097 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import ScalarCallable, CallableKernel from pymbolic import var @@ -130,10 
+133,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. """ + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + 
assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = buffer_array_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 0000000000000000000000000000000000000000..4798436973557b9f730f7b3382480b2a2e63a095 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,746 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + +from loopy.kernel import LoopKernel +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + Assignment, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import Program, ResolvedFunctionMarker +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_function_id_to_in_knl_callable_mapper + +.. autofunction:: register_callable_kernel +""" + + +# {{{ register function lookup + +def _resolved_callables_from_function_lookup(program, + func_id_to_in_kernel_callable_mapper): + """ + Returns a copy of *program* with the expression nodes marked "Resolved" + if any match is found through the given + *func_id_to_in_kernel_callable_mapper*. + + :arg func_id_to_in_kernel_callable_mapper: A function with signature + ``(target, identifier)`` that returns either an instance of + :class:`loopy.InKernelCallable` or *None*. 
+ """ + callables_table = program.callables_table + + callable_knls = dict( + (func_id, in_knl_callable) for func_id, in_knl_callable in + callables_table.items() if isinstance(in_knl_callable, + CallableKernel)) + edited_callable_knls = {} + + for func_id, in_knl_callable in callable_knls.items(): + kernel = in_knl_callable.subkernel + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, callables_table, + [func_id_to_in_kernel_callable_mapper]) + + new_subkernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + callables_table = resolved_function_marker.callables_table + + edited_callable_knls[func_id] = in_knl_callable.copy( + subkernel=new_subkernel) + + new_resolved_functions = {} + + for func_id, in_knl_callable in callables_table.items(): + if func_id in edited_callable_knls: + new_resolved_functions[func_id] = edited_callable_knls[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + callables_table = callables_table.copy( + resolved_functions=new_resolved_functions) + + return program.copy(callables_table=callables_table) + + +def register_function_id_to_in_knl_callable_mapper(program, + func_id_to_in_knl_callable_mapper): + """ + Returns a copy of *program* with the *function_lookup* registered. + + :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, + identifier)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if + the *function_identifier* is not known. + """ + + # adding the function lookup to the set of function lookers in the kernel. 
+ if func_id_to_in_knl_callable_mapper not in ( + program.func_id_to_in_knl_callable_mappers): + from loopy.tools import unpickles_equally + if not unpickles_equally(func_id_to_in_knl_callable_mapper): + raise LoopyError("function '%s' does not " + "compare equally after being upickled " + "and would disrupt loopy's caches" + % func_id_to_in_knl_callable_mapper) + new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( + [func_id_to_in_knl_callable_mapper]) + + program = _resolved_callables_from_function_lookup(program, + func_id_to_in_knl_callable_mapper) + + new_program = program.copy( + func_id_to_in_knl_callable_mappers=new_func_id_mappers) + + return new_program + +# }}} + + +# {{{ register_callable_kernel + +class _RegisterCalleeKernel(ImmutableRecord): + """ + Helper class to make the function scoper from + :func:`loopy.transform.register_callable_kernel` picklable. As python + cannot pickle lexical closures. + """ + fields = set(['callable_kernel']) + + def __init__(self, callable_kernel): + self.callable_kernel = callable_kernel + + def __call__(self, target, identifier): + if identifier == self.callable_kernel.subkernel.name: + return self.callable_kernel + return None + + +def register_callable_kernel(program, callee_kernel): + """Returns a copy of *caller_kernel*, which would resolve *function_name* in an + expression as a call to *callee_kernel*. + + :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg function_name: An instance of :class:`str`. + :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + """ + + # {{{ sanity checks + + assert isinstance(program, Program) + assert isinstance(callee_kernel, LoopKernel), ('{0} !=' + '{1}'.format(type(callee_kernel), LoopKernel)) + + # check to make sure that the variables with 'out' direction is equal to + # the number of assigness in the callee kernel intructions. 
+ expected_num_assignees = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_written_variables()]) + expected_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + caller_kernel = in_knl_callable.subkernel + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == callee_kernel.name): + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + else: + kw_parameters = {} + if len(insn.assignees) != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' " + "direction " "in callee kernel %s and the number " + "of assignees in " "instruction %s do not " + "match." % ( + callee_kernel.name, insn.id)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of " + "parameters in instruction %s do not match." + % (callee_kernel.name, insn.id)) + + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("unknown instruction %s" % type(insn)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) + + # }}} + + # take the function resolvers from the Program and resolve the functions in + # the callee kernel + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + callee_kernel.substitutions, + callee_kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, callee_kernel, program.callables_table, + program.func_id_to_in_knl_callable_mappers) + + callee_kernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(callee_kernel)) + callables_table = resolved_function_marker.callables_table.copy() + + program = program.copy(callables_table=callables_table) + + # making the target of the child kernel to be same as the target of parent + # kernel. + callable_kernel = CallableKernel(subkernel=callee_kernel.copy( + target=program.target, + is_called_from_host=False)) + + # FIXME disabling global barriers for callee kernel (for now) + from loopy import set_options + callee_kernel = set_options(callee_kernel, "disable_global_barriers") + + # FIXME: the number of callables is wrong. This is horrible please + # compensate. + + return register_function_id_to_in_knl_callable_mapper( + program, + _RegisterCalleeKernel(callable_kernel)) + +# }}} + + +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. 
+ + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in callee_arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel does not have " + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + +# {{{ inlining of a single call instruction + +def 
_inline_call_instruction(caller_kernel, callee_knl, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. + """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = caller_kernel.get_var_name_generator() + ing = caller_kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = caller_kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + for k, v in six.iteritems(arg_map): + if isinstance(v, SubArrayRef): + var_map[p.Variable(k)] = v.subscript.aggregate + else: + var_map[p.Variable(k)] = v + + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = 
NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + + new_atomicity = tuple( + type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) + for atomicity in insn.atomicity) + + if isinstance(insn, Assignment): + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + atomicity=new_atomicity + ) + else: + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + +# }}} + + +# {{{ inline callable kernel + +def _inline_single_callable_kernel(caller_kernel, function_name, + callables_table): + old_insns = caller_kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. 
Why not + # check whether the function is a scoped function first? ~AK + if insn.expression.function.name in callables_table: + history_of_identifier = callables_table.history[ + insn.expression.function.name] + + if function_name in history_of_identifier: + in_knl_callable = callables_table[ + insn.expression.function.name] + assert isinstance(in_knl_callable, CallableKernel) + caller_kernel = _inline_call_instruction( + caller_kernel, in_knl_callable.subkernel, insn) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return caller_kernel, callables_table + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. + """ + from loopy.preprocess import infer_arg_descr + program = infer_arg_descr(program) + callables_table = program.callables_table + old_callables_table = callables_table.copy() + + edited_callable_kernels = {} + + for func_id, in_knl_callable in old_callables_table.items(): + if function_name not in old_callables_table.history[func_id] and ( + isinstance(in_knl_callable, CallableKernel)): + caller_kernel = in_knl_callable.subkernel + caller_kernel, callables_table = ( + _inline_single_callable_kernel(caller_kernel, + function_name, + callables_table)) + edited_callable_kernels[func_id] = in_knl_callable.copy( + subkernel=caller_kernel) + + new_resolved_functions = {} + for func_id, in_knl_callable in callables_table.items(): + if func_id in edited_callable_kernels: + new_resolved_functions[func_id] = edited_callable_kernels[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + callables_table = callables_table.copy( + resolved_functions=new_resolved_functions) + + 
return program.copy(callables_table=callables_table) + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. + """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super(DimChanger, self).map_subscript(expr) + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, callee_knl): + """ + :returns: a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimensions required by *caller_knl*. 
+ """ + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name != + callee_knl.name): + # Call to a callable kernel can only occur through a + # CallInstruction. + continue + + def _shape_1_if_empty(shape): + assert isinstance(shape, tuple) + if shape == (): + return (1, ) + else: + return shape + + from loopy.kernel.function_interface import ( + ArrayArgDescriptor, get_arg_descriptor_for_expression, + get_kw_pos_association) + _, pos_to_kw = get_kw_pos_association(callee_knl) + arg_id_to_shape = {} + for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + arg_id = pos_to_kw[arg_id] + + arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) + if isinstance(arg_descr, ArrayArgDescriptor): + arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr.shape) + else: + arg_id_to_shape[arg_id] = (1, ) + + dim_changer = DimChanger( + callee_knl.arg_dict, + arg_id_to_shape) + + new_callee_insns = [] + for callee_insn in callee_knl.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction %s." 
% + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions + new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + + return new_callee_knl + + +class _FunctionCalledChecker(CombineMapper): + def __init__(self, func_name): + self.func_name = func_name + + def combine(self, values): + return any(values) + + def map_call(self, expr): + if expr.function.name == self.func_name: + return True + return self.combine( + tuple( + self.rec(child) for child in expr.parameters) + ) + + map_call_with_kwargs = map_call + + def map_constant(self, expr): + return False + + def map_algebraic_leaf(self, expr): + return False + + def map_kernel(self, kernel): + return any(self.rec(insn.expression) for insn in kernel.instructions if + isinstance(insn, MultiAssignmentBase)) + + +def _match_caller_callee_argument_dimension_(program, callee_function_name): + """ + Returns a copy of *program* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *program* aligned with the argument + dimensions required by *caller_knl*. + + .. note:: + + The callee kernel addressed by *callee_funciton_name*, should be + called only once. 
+ """ + assert isinstance(program, Program) + assert isinstance(callee_function_name, str) + + is_invoking_callee = _FunctionCalledChecker( + callee_function_name).map_kernel + + caller_knl, = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel) and + is_invoking_callee(in_knl_callable.subkernel)] + + old_callee_knl = program.callables_table[ + callee_function_name].subkernel + new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, old_callee_knl) + + new_callables_table = program.callables_table.copy() + new_callables_table.resolved_functions[callee_function_name] = ( + new_callables_table[callee_function_name].copy( + subkernel=new_callee_kernel)) + return program.copy(callables_table=new_callables_table) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..2c9499d9d92d73eeea1ce5344ca8475e60dedbd0 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,6 +30,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -140,7 +143,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. default_tag=_not_provided, @@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. 
""" + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -328,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # precompute module, but precompute acutally uses that to adjust its # warning message. - from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, callables_table, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # }}} @@ -385,6 +415,7 @@ def change_arg_to_image(knl, name): # {{{ tag array axes +@iterate_over_kernels_if_given_program def tag_array_axes(knl, ary_names, dim_tags): """ :arg dim_tags: a tuple of @@ -423,13 +454,15 @@ def tag_array_axes(knl, ary_names, dim_tags): return knl -tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes) +tag_data_axes = ( + MovedFunctionDeprecationWrapper(tag_array_axes)) # }}} # {{{ set_array_axis_names +@iterate_over_kernels_if_given_program def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -454,13 +487,15 @@ def set_array_axis_names(kernel, ary_names, dim_names): return kernel -set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names) +set_array_dim_names = (MovedFunctionDeprecationWrapper( + set_array_axis_names)) # }}} # {{{ remove_unused_arguments +@iterate_over_kernels_if_given_program def remove_unused_arguments(knl): new_args = [] @@ -502,6 +537,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries +@iterate_over_kernels_if_given_program def alias_temporaries(knl, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of @@ -586,11 +622,14 @@ def alias_temporaries(knl, names, base_name_prefix=None, # {{{ set argument order +@iterate_over_kernels_if_given_program def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ + #FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # take a within? 
if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -619,6 +658,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@iterate_over_kernels_if_given_program def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 @@ -664,6 +704,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@iterate_over_kernels_if_given_program def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -705,6 +746,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 21e61075596bc2b795434716ba8a4347f5cfb173..33bd519b2d84bf9d64a65214897e9084375dd6a4 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -33,6 +33,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -379,6 +380,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. 
""" + assert isinstance(knl, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 70ad2406aabdd63ee21c448aac1091999247925e..ac0c5c83eb6729422c58fe52bba56b6a193ad494 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -31,6 +31,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.program import rename_resolved_functions_in_a_single_kernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -287,50 +291,8 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_kernels(kernels, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. 
- - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ +def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -411,4 +373,101 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result + +def fuse_kernels(programs, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. 
+      Use :func:`loopy.rename_iname` if this is not desired.
+
+    * The projection of the domains of each pair of kernels onto their
+      common subset of inames must match in order for fusion to
+      succeed.
+
+    * Assumptions are fused by taking their conjunction.
+
+    * If kernel arguments with matching names are encountered across
+      *kernels*, their declarations must match in order for fusion to
+      succeed.
+
+    * Temporaries are automatically renamed to remain uniquely associated
+      with each instruction stream.
+
+    * The resulting kernel will contain all instructions from each entry
+      of *kernels*. Clashing instruction IDs will be renamed to ensure
+      uniqueness.
+
+    .. versionchanged:: 2016.2
+
+       *data_flow* was added in version 2016.2
+    """
+
+    # all the resolved functions in programs must be registered in
+    # main_callables_table
+    main_prog_callables_info = (
+            programs[0].callables_table)
+    old_root_kernel_callable = (
+            programs[0].callables_table[programs[0].name])
+    kernels = [programs[0].root_kernel]
+
+    # removing the callable collisions that may be present
+    for prog in programs[1:]:
+        root_kernel = prog.root_kernel
+        renames_needed = {}
+        for old_func_id, in_knl_callable in prog.callables_table.items():
+            if isinstance(in_knl_callable, CallableKernel):
+                # Fusing programs with multiple callable kernels is tough.
+                # Reason: Need to first figure out the order in which the
+                # callable kernels must be resolved into
+                # main_callables_table, because renaming needs to be
+                # done in the callable kernels before registering.
+                # Hence disabling it until required.
+                if in_knl_callable.subkernel.name != prog.name:
+                    raise LoopyError("fuse_kernels cannot fuse programs with "
+                            "multiple callable kernels.")
+
+                # the root kernel is dealt with at the end after performing
+                # all the renaming.
+ continue + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_added_callable(var(old_func_id), + in_knl_callable)) + + if old_func_id != new_func_id: + renames_needed[old_func_id] = new_func_id + + if renames_needed: + root_kernel = rename_resolved_functions_in_a_single_kernel( + root_kernel, renames_needed) + + kernels.append(root_kernel) + + new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) + new_root_kernel_callable = old_root_kernel_callable.copy( + subkernel=new_root_kernel.copy(name=programs[0].name)) + + # TODO: change the name of the final root kernel. + main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( + var(programs[0].name), new_root_kernel_callable) + + return programs[0].copy( + callables_table=main_prog_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 96c8252ef7e6622250e9006b2275ef7816700b5c..584aca6a4f6914e61ff50b593c500e342bf495fd 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,6 +34,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. currentmodule:: loopy @@ -91,6 +95,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. 
If the @@ -105,6 +110,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -314,13 +321,15 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) + return tag_inames(kernel, {outer_iname: outer_tag, + inner_iname: inner_tag}) # }}} # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -346,6 +355,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -362,6 +373,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -496,6 +508,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super(_InameJoiner, self).map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """ :arg inames: fastest varying last @@ -503,6 +516,22 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): :func:`loopy.match.parse_stack_match`. 
""" + from loopy.match import parse_match + within = parse_match(within) + + # {{{ return the same kernel if no kernel matches + + def _do_not_transform_if_no_within_matches(): + for insn in kernel.instructions: + if within(kernel, insn): + return + + return kernel + + _do_not_transform_if_no_within_matches() + + # }}} + # now fastest varying first inames = inames[::-1] @@ -581,8 +610,8 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - within_inames=subst_within_inames(insn.within_inames)) - for insn in kernel.instructions] + within_inames=subst_within_inames(insn.within_inames)) if + within(kernel, insn) else insn for insn in kernel.instructions] kernel = (kernel .copy( @@ -607,7 +636,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) - return kernel + return remove_unused_inames(kernel, inames) # }}} @@ -640,7 +669,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. 
*new_tag* is given @@ -824,7 +855,9 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, +@iterate_over_kernels_if_given_program +def duplicate_inames(knl, inames, within, new_inames=None, + suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -986,7 +1019,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1052,7 +1085,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1080,18 +1113,42 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + for option in get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into): + yield option + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." 
+ % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1302,6 +1359,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1321,6 +1379,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1344,6 +1403,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. 
@@ -1675,6 +1735,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1721,6 +1782,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(knl, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..f73110ecdff79d7c029c0dd0d895ef71ea68326b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -25,15 +25,35 @@ THE SOFTWARE. import six # noqa from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable))) + + return insns + # }}} @@ -58,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -75,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -92,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -209,6 +232,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -228,6 +252,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -260,18 +285,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -324,6 +352,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py new file mode 100644 index 0000000000000000000000000000000000000000..ab91fdf78068fda7ee27cf8543419c22ae723024 --- /dev/null +++ b/loopy/transform/make_scalar.py @@ -0,0 +1,51 @@ +from pymbolic.primitives import Variable +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.kernel.data import ValueArg +from loopy.transform.iname import remove_unused_inames + + +class ScalarChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, var_name): + self.var_name = var_name + super(ScalarChanger, self).__init__(rule_mapping_context) + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name == self.var_name: + return Variable(self.var_name) + + return super(ScalarChanger, self).map_subscript(expr, expn_state) + + +def make_scalar(kernel, var_name): + rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, + kernel.get_var_name_generator()) + + kernel = 
ScalarChanger(rule_mapping_context, var_name).map_kernel(kernel) + + new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, + is_output_only=arg.is_output_only) if arg.name == var_name else arg for + arg in kernel.args] + new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) + if tv.name == var_name else (tv.name, tv) for tv in + kernel.temporary_variables.values()) + + return kernel.copy(args=new_args, temporary_variables=new_temps) + + +def remove_invariant_inames(kernel): + inames_used = set() + untagged_inames = ( + kernel.all_inames() - frozenset(kernel.iname_to_tags.keys())) + for insn in kernel.instructions: + for iname in ((insn.read_dependency_names() + | insn.write_dependency_names()) + & untagged_inames): + inames_used.add(iname) + + removable_inames = untagged_inames - inames_used + + new_insns = [insn.copy(within_inames=insn.within_inames-removable_inames) + for insn in kernel.instructions] + + return remove_unused_inames(kernel.copy(instructions=new_insns), + removable_inames) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 0000000000000000000000000000000000000000..a18326187379cac0b4be46bbfe244bcc2d9e7684 --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,345 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of 
the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.diagnostic import LoopyError
+from loopy.kernel.instruction import CallInstruction
+from loopy.program import Program
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.symbolic import SubArrayRef
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: pack_and_unpack_args_for_call
+"""
+
+
+def pack_and_unpack_args_for_call_for_single_kernel(kernel,
+        callables_table, call_name, args_to_pack=None,
+        args_to_unpack=None):
+    """
+    Returns a copy of *kernel* with instructions appended to copy the
+    arguments in *args* to match the alignment expected by the *call_name* in
+    the kernel. The arguments are copied back to *args* with the appropriate
+    data layout.
+
+    :arg call_name: An instance of :class:`str` denoting the function call in
+        the *kernel*.
+    :arg args_to_pack: A list of the arguments as instances of :class:`str` which
+        must be packed. If set to *None*, it is interpreted that all the array
+        arguments would be packed.
+    :arg args_to_unpack: A list of the arguments as instances of :class:`str`
+        which must be unpacked. If set to *None*, it is interpreted that
+        all the array arguments should be unpacked.
+    """
+    assert isinstance(kernel, LoopKernel)
+    new_domains = []
+    new_tmps = kernel.temporary_variables.copy()
+    old_insn_to_new_insns = {}
+
+    for insn in kernel.instructions:
+        if not isinstance(insn, CallInstruction):
+            # pack and unpack can only be done for CallInstructions.
+ continue + if insn.expression.function.name not in callables_table: + continue + + in_knl_callable = callables_table[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args_to_pack is None: + args_to_pack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + if args_to_unpack is None: + args_to_unpack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + + # {{{ sanity checks for args + + assert isinstance(args_to_pack, list) + assert isinstance(args_to_unpack, list) + + for arg in args_to_pack: + found_sub_array_ref = False + + for par in parameters + insn.assignees: + # checking that the given args is a sub array ref + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." % (arg, call_name, insn.id)) + for arg in args_to_unpack: + if arg not in args_to_pack: + raise LoopyError("Argument %s should be packed in order to be " + "unpacked." 
% arg) + + # }}} + + packing_insns = [] + unpacking_insns = [] + + # {{{ handling ilp tags + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = set(iname for iname in insn.within_inames + if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) + for tag in kernel.iname_to_tags.get(iname, []))) + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + # }}} + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + # dict to store the new assignees and parameters, the mapping pattern + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype + id_to_parameters = tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} + + for arg_id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in + args_to_pack): + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + + new_pack_inames = dict((iname, var(vng(iname.name + + "_pack"))) for iname in p.swept_inames) + new_unpack_inames = dict((iname, var(vng(iname.name + + "_unpack"))) for iname in p.swept_inames) + + # Updating the domains corresponding to the new inames. 
+ for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=arg_in_caller.dtype, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) + + # {{{ getting the lhs for packing and rhs for unpacking + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) + 
+ # }}} + + packing_insns.append(Assignment( + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_pack_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack"), + depends_on_is_final=True + )) + + if p.subscript.aggregate.name in args_to_unpack: + unpacking_insns.append(Assignment( + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_unpack_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), + depends_on_is_final=True + )) + + # {{{ creating the sweep inames for the new sub array refs + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[arg_id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[arg_id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[arg_id] = p + + if packing_insns: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_call_insn = insn.with_transformed_expressions(subst_mapper) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in + enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) + new_call_insn = new_call_insn.copy( + depends_on=new_call_insn.depends_on | set( 
def pack_and_unpack_args_for_call(program, *args, **kwargs):
    """Program-level driver for argument packing/unpacking.

    Applies :func:`pack_and_unpack_args_for_call_for_single_kernel` to the
    subkernel of every :class:`CallableKernel` in *program*'s callables
    table; :class:`ScalarCallable` entries are passed through unchanged.
    """
    assert isinstance(program, Program)

    updated_callables = {}
    for func_id, clbl in program.callables_table.items():
        if isinstance(clbl, CallableKernel):
            # transform the wrapped kernel, then re-wrap it
            transformed_subkernel = pack_and_unpack_args_for_call_for_single_kernel(
                    clbl.subkernel, program.callables_table,
                    *args, **kwargs)
            clbl = clbl.copy(subkernel=transformed_subkernel)
        elif not isinstance(clbl, ScalarCallable):
            raise NotImplementedError("Unknown type of callable %s." % (
                type(clbl).__name__))

        updated_callables[func_id] = clbl

    return program.copy(
            callables_table=program.callables_table.copy(
                resolved_functions=updated_callables))

# vim: foldmethod=marker
+375,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -387,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): There was a more complicated, dumber function called :func:`split_array_dim` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -439,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index fc5dad91dd73245c328f06e2c452b1d3d3a1da2b..5c5e94028e5dfaa87f802f88bae715cfe733d6af 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -40,6 +43,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. 
@@ -67,7 +71,7 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def _fix_parameter(kernel, name, value): +def _fix_parameter(kernel, name, value, remove_argument): def process_set(s): var_dict = s.get_var_dict() @@ -103,7 +107,7 @@ def _fix_parameter(kernel, name, value): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name == name: + if arg.name == name and remove_argument: # remove from argument list continue @@ -134,6 +138,7 @@ def _fix_parameter(kernel, name, value): )) +@iterate_over_kernels_if_given_program def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. @@ -141,9 +146,17 @@ def fix_parameters(kernel, **value_dict): to be *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. """ + assert isinstance(kernel, LoopKernel) + + # FIXME: Parameter / argument terminology? + + # FIXME: Is _remove the right approach? (I'm not sure it is.) Because of + # the potential namespace conflict. If yes, document. If no, fix. 
+ + remove_arg = value_dict.pop("_remove", True) for name, value in six.iteritems(value_dict): - kernel = _fix_parameter(kernel, name, value) + kernel = _fix_parameter(kernel, name, value, remove_arg) return kernel diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 9f426f76bc6902fd09bd7685c73f187df935be1e..8837cc3d574d751ac9bf5af4dc04250e6ef87d33 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -38,6 +38,9 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -258,9 +261,9 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, callables_table, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. 
def precompute(program, *args, **kwargs):
    """Program-level wrapper around :func:`precompute_for_single_kernel`.

    Runs the precompute transformation on the subkernel of every
    :class:`CallableKernel` in *program*'s callables table;
    :class:`ScalarCallable` entries are left untouched.
    """
    assert isinstance(program, Program)

    updated_callables = {}
    for func_id, clbl in program.callables_table.items():
        if isinstance(clbl, CallableKernel):
            # transform the wrapped kernel, then re-wrap it
            clbl = clbl.copy(
                    subkernel=precompute_for_single_kernel(
                        clbl.subkernel, program.callables_table,
                        *args, **kwargs))
        elif not isinstance(clbl, ScalarCallable):
            raise NotImplementedError("Unknown type of callable %s." % (
                type(clbl).__name__))

        updated_callables[func_id] = clbl

    return program.copy(
            callables_table=program.callables_table.copy(
                resolved_functions=updated_callables))

# vim: foldmethod=marker
@@ -628,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -722,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(knl): +def save_and_reload_temporaries(program): """ Add instructions to save and reload temporary variables that are live across kernel calls. @@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl): :returns: The resulting kernel """ + + knl = program.root_kernel + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program.root_kernel, + program.callables_table) + + assert knl.schedule is not None + liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) @@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_root_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 3eee3d8f3093ce68670ab2c119f41bc385afde01..725e6792055b6ea7dc0b8663204a264540f79fd3 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -31,6 +31,8 @@ from loopy.diagnostic import LoopyError from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -42,6 +44,7 @@ class 
ExprDescriptor(ImmutableRecord): # {{{ extract_subst +@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -169,6 +172,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): instructions=new_insns, substitutions=new_substs) + # }}} @@ -256,6 +260,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -439,6 +444,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions @@ -447,6 +453,7 @@ def expand_subst(kernel, within=None): :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + if not kernel.substitutions: return kernel @@ -479,8 +486,17 @@ def find_rules_matching(knl, pattern): return [r for r in knl.substitutions if pattern.match(r)] -def find_one_rule_matching(knl, pattern): - rules = find_rules_matching(knl, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." 
def get_return_types_as_tuple(arg_id_to_dtype):
    """Return the inferred return dtypes of a call, in positional order.

    :arg arg_id_to_dtype: an instance of :class:`dict` mapping argument
        identifiers to their inferred types; negative integer keys denote
        return values (-1 is the first return value, -2 the second, ...).
    """
    # Return values are keyed by negative integers, with -1 first; sorting
    # the keys in descending order therefore recovers positional order.
    return_ids = sorted(
            (arg_id for arg_id in arg_id_to_dtype
                if isinstance(arg_id, int) and arg_id < 0),
            reverse=True)

    return tuple(arg_id_to_dtype[arg_id] for arg_id in return_ids)
def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
    """Return a copy of *kernel* in which the function of every call listed
    in *pymbolic_calls_to_new_names* is renamed to the corresponding new
    (e.g. type-specialized) function identifier.

    :arg kernel: a :class:`loopy.LoopKernel`.
    :arg pymbolic_calls_to_new_names: a mapping from instances of
        :class:`pymbolic.primitives.Call` to :class:`str`.

    For example, given a kernel instruction

        ``y[i] = ResolvedFunction('sin')(x[i])``

    and the mapping ``{Call(ResolvedFunction(Variable('sin')),
    (Subscript(Variable('x'), Variable('i')),)): 'sin_1'}``, the returned
    kernel contains

        ``y[i] = ResolvedFunction('sin_1')(x[i])``
    """
    # Renaming is done with a rule-aware mapper so that calls hidden inside
    # substitution rules are found as well.
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
    name_changer = FunctionNameChanger(rule_mapping_context,
            pymbolic_calls_to_new_names, subst_expander)

    return rule_mapping_context.finish_kernel(
            name_changer.map_kernel(kernel))

# }}}
TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). - def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, callables_table=None): + if callables_table is None: + callables_table = self.callables_table + return type(self)(self.kernel, callables_table, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.callables_table, new_ass) @staticmethod def combine(dtype_sets): @@ -250,15 +406,19 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + + from pymbolic.primitives import Variable, CallWithKwargs, Call + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ResolvedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -266,25 +426,148 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.callables_table[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an 
already specialized function. + for id, dtype in arg_id_to_dtype.items(): + if id in in_knl_callable.arg_id_to_dtype and ( + in_knl_callable.arg_id_to_dtype[id] != + arg_id_to_dtype[id]): + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + if np.can_cast(arg_id_to_dtype[id].dtype.type, + in_knl_callable.arg_id_to_dtype[id].dtype.type): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + in_knl_callable, self.callables_table = ( + in_knl_callable.with_types( + arg_id_to_dtype, self.kernel, + self.callables_table)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) + + # storing the type specialized function so that it can be used for + # later use + self.callables_table, new_function_id = ( + self.callables_table.with_callable( + expr.function.function, + in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls[expr] = new_function_id + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes 
= tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. + + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + in_knl_callable = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + self.callables_table, new_function_id = ( + self.callables_table.with_added_callable( + expr.function, in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = new_function_id + + # Returning the type. 
+ if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") + + return [mangle_result.result_dtypes[0]] + # }}} - return [mangle_result.result_dtypes[0]] + return [] - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + map_call_with_kwargs = map_call def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -352,11 +635,20 @@ class TypeInferenceMapper(CombineMapper): def map_comparison(self, expr): # "bool" is unusable because OpenCL's bool has indeterminate memory # format. + self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) + return [NumpyType(np.dtype(np.int32))] + + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + def map_logical_and(self, expr): + for child in expr.children: + self.rec(child) + return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + map_logical_or = map_logical_and def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] @@ -399,14 +691,20 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.callables_table) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +749,15 @@ def 
_infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.callables_table) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.callables_table) # }}} @@ -482,7 +784,8 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, callables_table, + expect_completion=False): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +847,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, callables_table, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +857,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -575,10 +881,15 @@ def infer_unknown_types(kernel, expect_completion=False): item = item_lookup[name] debug("inferring type for %s %s", type(item).__name__, item.name) - - result, symbols_with_unavailable_types = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) + try: + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, callables_table) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + except DependencyTypeInferenceFailure: + result = tuple() + 
type_inf_mapper = type_inf_mapper.copy( + callables_table=callables_table) failed = not result if not failed: @@ -597,6 +908,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -635,23 +947,132 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. + + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + if 
_instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, + return_tuple=len(insn.assignees) > 1, + return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + callables_table = type_inf_mapper.callables_table + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + # the check is unnecessary as we would first get TypeInferenceFailure before + # encountering this. Move this at the start once ManglerCallable is + # deprecated. + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. 
+ from loopy.check import check_functions_are_resolved + check_functions_are_resolved(type_specialized_kernel) + + return type_specialized_kernel, callables_table + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + + callables_table = program.callables_table + + type_uninferred_knl_callable = ( + callables_table[program.name]) + type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + + old_callables_count = callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel, callables_table = ( + infer_unknown_types_for_a_single_kernel( + type_uninferred_root_kernel, + callables_table, expect_completion)) + + type_inferred_knl_callable = type_uninferred_knl_callable.copy( + subkernel=root_kernel) + + callables_table, _ = ( + callables_table.with_callable( + program.name, + type_inferred_knl_callable)) + + callables_table = ( + callables_table.with_exit_edit_callables_mode( + old_callables_count)) + + return program.copy(callables_table=callables_table) + # }}} # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -682,7 +1103,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.callables_table) # }}} diff --git a/loopy/types.py b/loopy/types.py index 8f0f310c305b3d5b24bd6e771b501bb6d9c69224..4e77317c105a1f8b6acb61029ae6d81533d60372 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -177,6 +177,45 @@ class AtomicNumpyType(NumpyType, AtomicType): # }}} +# {{{ + +class 
OpaqueType(LoopyType): + """An opaque data type is truly opaque - it has no allocations, no + temporaries of that type, etc. The only thing allowed is to be passed in + through one ValueArg and go out to another. It is introduced to accommodate + functional calls to external libraries. + """ + def __init__(self, name): + assert isinstance(name, str) + self.name = name + self.target = None + + def is_integral(self): + return False + + def is_complex(self): + return False + + def involves_complex(self): + return False + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.name) + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self.name == other.name) + + def __ne__(self, other): + return not self.__eq__(other) + +# }}} + + def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, target=None): from loopy.kernel.data import auto diff --git a/test/test_apps.py b/test/test_apps.py index e07262dbdda8ad3c24522f7d0eb4dba8422bf0ce..c1ff4b893c459f9860fca3fbda8d06406676b8b5 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -662,7 +663,7 @@ def test_domain_tree_nesting(): lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl.root_kernel.parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index b0ca7ade25d3077c7f868f366cb9ff6bb011af33..b1f335bbb7fdcd9cf1e53603d5b70d1a224ee140 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -76,6 +76,7 @@ def test_c_target_strides(): # test with C-order knl = __get_kernel('C') + lp.generate_code_v2(knl) a_np = np.reshape(np.arange(16 * 16, 
dtype=np.float32), (16, -1), order='C') diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f3acbd67dc792e6f5c1aa7cd2581896b3ca024 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,591 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_function_id_to_in_knl_callable_mapper(prog, + register_log2_lookup) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_function( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name='linear_combo1') + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name='linear_combo2') + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + knl = lp.register_callable_kernel( + knl, grandchild_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') + + evt, (out, ) = 
knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg('f, e, h, g'), '...'], + name='linear_combo') + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, 
m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 5 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < 32}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name='linear_combo') + + callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + + knl = lp.set_options(knl, 'return_dict') + + gsize, lsize = knl.get_grid_size_upper_bounds_as_exprs() + + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, out = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert gsize == (16, 4) + assert lsize == (2, 8) + assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def 
test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_function( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """, name="callee_fn1") + + callee2 = lp.make_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """, name="callee_fn2") + + callee3 = lp.make_function( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, callee1) + knl = lp.register_callable_kernel(knl, callee2) + knl = lp.register_callable_kernel(knl, callee3) + + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, 'callee_fn3') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i")[0] + i = p.Variable("i") + index = p.Variable("index")[0] + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + 
lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_diff.py b/test/test_diff.py index b735ab17a716c84bfa52df7f73476b4c575cda0f..d001233c0eced5ecaf9342b90da0487faefb21f3 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( + knl = lp.make_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) @@ -66,6 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.make_program(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_domain.py b/test/test_domain.py index ebfde850907d68bebf06076fbf1c87d8bb093f71..dd789d2cd8152413c815fddbbca62b94797623bf 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -61,20 +61,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), 
lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = 
lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) - assert knl.parents_per_domain()[1] == 0 + assert knl.root_kernel.parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl.root_kernel.parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j 1e-15 + assert abs_err < 1e-6 + + def test_fill(ctx_factory): fortran_src = """ subroutine fill(out, a, n) @@ -60,18 +151,18 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! 
fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = [fill] + ! RESULT = fill ! !$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.all_inames() + assert "i_inner" in knl.root_kernel.all_inames() ctx = ctx_factory() @@ -92,7 +183,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -115,7 +206,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -139,7 +230,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -166,7 +257,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -194,15 +285,15 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) ref_knl = knl - assert "a" in knl.temporary_variables + assert "a" in knl.root_kernel.temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.temporary_variables + assert "a" not in knl.root_kernel.temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -231,7 +322,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -265,7 +356,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -299,9 +390,9 @@ def 
test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl.root_kernel.domains) == 1 ref_knl = knl @@ -360,7 +451,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -404,18 +495,18 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) knl = lp.prioritize_loops(knl, "e,i,j,k") - assert len(knl.temporary_variables) == 2 + assert len(knl.root_kernel.temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) @@ -447,15 +538,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! 
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -475,9 +568,9 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl.root_kernel.domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") @@ -503,6 +596,53 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) +def test_fortran_subroutines(): + fortran_src = """ + subroutine twice(n, a) + implicit none + real*8 a(n) + integer i,n + + do i=1,n + a(i) = a(i) * 2 + end do + end subroutine + + subroutine twice_cross(n, a, i) + implicit none + integer i, n + real*8 a(n,n) + + call twice(n, a(1:n, i)) + call twice(n, a(i, 1:n)) + end subroutine + """ + prg = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(prg).device_code()) + + +def test_domain_fusion_imperfectly_nested(): + fortran_src = """ + subroutine imperfect(n, m, a, b) + implicit none + integer i, j, n, m + real a(n), b(n,n) + + do i=1, n + a(i) = i + do j=1, m + b(i,j) = i*j + end do + end do + end subroutine + """ + + prg = lp.parse_fortran(fortran_src) + # If n > 0 and m == 0, a single domain would be empty, + # leading (incorrectly) to no assignments to 'a'. 
+ assert len(prg["imperfect"].domains) > 1 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_isl.py b/test/test_isl.py index 12e7c767082005f34ef7112ce26f3d2c308fd390..ff58a1bb315d992051018ca38992820156393192 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -51,6 +51,23 @@ def test_pw_aff_to_conditional_expr(): assert str(expr) == "0 if i == 0 else -1 + i" +def test_subst_into_pwqpolynomial(): + from pymbolic.primitives import Variable + arg_dict = { + 'm': 3*Variable("nx"), + 'n': 3*Variable("ny"), + 'nx': Variable('nx'), + 'ny': Variable('ny'), + 'nz': Variable('nz')} + space = isl.Set("[nx, ny, nz] -> { []: }").space + poly = isl.PwQPolynomial("[m, n] -> { (256 * m + 256 * m * n) : " + "m > 0 and n > 0; 256 * m : m > 0 and n <= 0 }") + + from loopy.isl_helpers import subst_into_pwqpolynomial + result = subst_into_pwqpolynomial(space, poly, arg_dict) + assert "(768 * nx + 2304 * nx * ny)" in str(result) + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_loopy.py b/test/test_loopy.py index 6b78ac26b78d8c85dab3cd41af0ce1d99d52ec07..3c985640bae6cdb07939e1a3a752b642f6dac2e6 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -97,7 +97,7 @@ def test_complicated_subst(ctx_factory): print(knl) - sr_keys = list(knl.substitutions.keys()) + sr_keys = list(knl.root_kernel.substitutions.keys()) for letter, how_many in [ ("f", 1), ("g", 1), @@ -107,8 +107,10 @@ def test_complicated_subst(ctx_factory): assert substs_with_letter == how_many -def test_type_inference_no_artificial_doubles(): - knl = lp.make_kernel( +def test_type_inference_no_artificial_doubles(ctx_factory): + ctx = ctx_factory() + + prog = lp.make_kernel( "{[i]: 0<=i bb = a[i] - b[i] @@ -120,16 +122,15 @@ def test_type_inference_no_artificial_doubles(): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl 
= lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code_v2(prog).device_code() + assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -141,13 +142,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -181,16 +186,12 @@ def test_simple_side_effect(ctx_factory): """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -201,17 +202,14 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = 
lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): @@ -223,17 +221,14 @@ def test_wg_too_small(ctx_factory): " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - import pytest - for gen_knl in kernel_gen: - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, gen_knl).get_code() + print(knl) + with pytest.raises(RuntimeError): + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): @@ -245,17 +240,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_bare_data_dependency(ctx_factory): @@ -285,7 +277,9 @@ def test_bare_data_dependency(ctx_factory): # {{{ test race detection -def test_ilp_write_race_detection_global(): +def test_ilp_write_race_detection_global(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j a[i] = 5+i+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + assert knl.root_kernel.temporary_variables["a"].shape == 
(16, 17) -def test_ilp_write_race_avoidance_private(): +def test_ilp_write_race_avoidance_private(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[j]: 0<=j<16 }", [ "<> a = 5+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} @@ -359,11 +358,12 @@ def test_write_parameter(ctx_factory): lp.GlobalArg("b", dtype, shape=()), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) import pytest with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() + lp.generate_code_v2(knl).device_code() # {{{ arg guessing @@ -384,10 +384,11 @@ def test_arg_shape_guessing(ctx_factory): lp.GlobalArg("c", shape=lp.auto), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing(ctx_factory): @@ -400,10 +401,11 @@ def test_arg_guessing(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing_with_reduction(ctx_factory): @@ -418,16 +420,16 @@ def test_arg_guessing_with_reduction(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl 
import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -443,11 +445,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -464,10 +466,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -499,9 +502,9 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - #print(cknl.get_highlighted_code({"a": a.dtype})) - knl = lp.set_options(knl, write_cl=True) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + print(lp.generate_code_v2(knl)) knl(queue, a=a, b=b) import numpy.linalg as la @@ -519,18 +522,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." 
- ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -551,18 +552,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -605,8 +607,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): None, shape=None), "..."]) - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog.root_kernel.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -620,14 +622,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -775,11 +775,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -859,9 +855,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @@ -985,7 +979,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2278,10 +2263,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2291,7 +2277,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel('{[i]: 0 <= i < 10}', + prog = 
lp.make_kernel('{[i]: 0 <= i < 10}', """ for i a[i] = i {id=a} @@ -2306,15 +2292,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0') from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog.root_kernel knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_root_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): @@ -2323,7 +2311,7 @@ def test_multi_argument_reduction_type_inference(): from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog.root_kernel.get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2485,7 +2475,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == set(["n"]) + assert knl.root_kernel.all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2504,7 +2494,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2517,11 +2507,15 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"]) - assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"]) - assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"]) - assert 
knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"]) + assert prog.root_kernel.id_to_insn["insn1"].depends_on == set() + assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + set(["insn2"])) + assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + set(["insn3"])) + assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + "insn2"])) + assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + set(["insn1", "insn5"])) def test_preamble_with_separate_temporaries(ctx_factory): @@ -2615,7 +2609,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ @@ -948,10 +1007,25 @@ def test_barrier_counter_barriers(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - barrier_count = sync_map["barrier_local"].eval_with_dict(params) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 +def test_barrier_count_single(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + <> c[i] = 15*i {id=yoink} + c[i+1] = c[i] {dep=yoink} + """) + + knl = lp.tag_inames(knl, {"i": "l.0"}) + sync_map = lp.get_synchronization_map(knl) + print(sync_map) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() + assert barrier_count == 1 + + def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( @@ -978,21 +1052,21 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize + assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 + assert sync_map.filter_by(kind="barrier_local").eval_and_sum(params) == 2*m/bsize op_map = lp.get_op_map(knl, 
subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP) + lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.SUBGROUP) + lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP) + lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) + lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1005,13 +1079,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={1: bsize}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('m')}, gid_strides={0: Variable('m')*bsize}, direction='load', - variable='a', count_granularity=CG.WORKITEM) + variable='a', count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1021,7 +1097,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell @@ -1040,14 +1117,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable='a_fetch', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={0: 1}, gid_strides={}, variable='b_fetch', - count_granularity=CG.SUBGROUP) + 
count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1096,9 +1175,8 @@ def test_floor_div_coefficient_collector(): n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses - f32_local = lp.get_mem_access_map( - knl, count_redundant_work=True, subgroup_size=SGS - ).filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) + m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) + f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups @@ -1136,7 +1214,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction='load', variable='b', variable_tag='mmbload', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={1: Variable('m')}, @@ -1144,7 +1223,8 @@ def test_mem_access_tagged_variables(): direction='load', variable='a', variable_tag='mmaload', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1157,7 +1237,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', variable_tag='mmresult', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell @@ -1176,7 +1257,7 @@ def test_gather_access_footprint(): fp = gather_access_footprints(knl) for key, footprint in six.iteritems(fp): - print(key, count(knl, footprint)) + print(key, count(knl.root_kernel, footprint)) def test_gather_access_footprint_2(): @@ -1191,8 +1272,8 @@ def test_gather_access_footprint_2(): params = {"n": 200} for key, footprint in six.iteritems(fp): - assert count(knl, 
footprint).eval_with_dict(params) == 200 - print(key, count(knl, footprint)) + assert count(knl.root_kernel, footprint).eval_with_dict(params) == 200 + print(key, count(knl.root_kernel, footprint)) def test_summations_and_filters(): @@ -1316,8 +1397,8 @@ def test_strided_footprint(): x_l_foot = footprints[('x', 'read')] from loopy.statistics import count - num = count(knl, x_l_foot).eval_with_dict(param_dict) - denom = count(knl, x_l_foot.remove_divs()).eval_with_dict(param_dict) + num = count(knl.root_kernel, x_l_foot).eval_with_dict(param_dict) + denom = count(knl.root_kernel, x_l_foot.remove_divs()).eval_with_dict(param_dict) assert 2*num < denom diff --git a/test/test_target.py b/test/test_target.py index bcf85a340a29afc8772686d23c5fe3e8a03ccffd..0d34310664e027f8e7ee133da871c91723295d10 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -72,9 +72,7 @@ def test_ispc_target(occa_mode=False): knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - codegen_result = lp.generate_code_v2( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl))) + codegen_result = lp.generate_code_v2(knl) print(codegen_result.device_code()) print(codegen_result.host_code()) @@ -98,9 +96,8 @@ def test_cuda_target(): default_tag="l.auto") print( - lp.generate_code( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) + lp.generate_code_v2( + knl).device_code()) def test_generate_c_snippet(): @@ -140,10 +137,7 @@ def test_generate_c_snippet(): knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) knl = lp.prioritize_loops(knl, "I,k_outer,k_inner") - - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - print(lp.generate_body(knl)) + print(lp.generate_code_v2(knl)) @pytest.mark.parametrize("target", [CTarget, OpenCLTarget]) @@ -353,8 +347,6 @@ def test_ispc_streaming_stores(): knl = lp.set_argument_order(knl, vars + ["n"]) - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) 
lp.generate_code_v2(knl).all_code() @@ -375,6 +367,32 @@ def test_cuda_short_vector(): print(lp.generate_code_v2(knl).device_code()) +def test_nvidia_pyopencl_target(ctx_factory): + ctx = ctx_factory() + if ctx.devices[0].vendor != 'NVIDIA Corporation': + # do not test for non-Nvidia devices + return + + queue = cl.CommandQueue(ctx) + a = np.random.randn(16) + + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + """ + res[0] = res[0] + a[i] {id=update, atomic} + """, + [ + lp.GlobalArg('res', for_atomic=True), + lp.GlobalArg('a', for_atomic=True, dtype=np.float64), + '...']) + + knl = lp.split_iname(knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + knl = knl.copy(target=lp.NvidiaPyOpenCLTarget(ctx.devices[0])) + + evt, (out, ) = knl(queue, a=a) + assert np.isclose(out, a.sum()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index 6eb6697b5c192911864000781381244dfcbef631..9300f45c33b80195facc70a44c360363a69b2396 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -23,6 +23,7 @@ THE SOFTWARE. 
""" import sys +import six import numpy as np import loopy as lp import pyopencl as cl @@ -127,7 +128,7 @@ def test_to_batched(ctx_factory): def test_to_batched_temp(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( ''' { [i,j]: 0<=i,j a[j] = j {inames=i:j} + """) + + prog = lp.save_temporaries_in_loop(prog, 'i', ['a']) + assert prog.root_kernel.temporary_variables['a'].shape == (4, 4) + + def test_add_barrier(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -255,18 +268,17 @@ def test_vectorize(ctx_factory): a[i] = temp """) knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.set_array_axis_names(knl, "a,b", "i") knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, split_kwargs=dict(slabs=(0, 1))) - knl = lp.tag_data_axes(knl, "a,b", "c,vec") + knl = lp.tag_array_axes(knl, "a,b", "c,vec") ref_knl = knl ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) knl = lp.tag_inames(knl, {"i_inner": "vec"}) knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) code, inf = lp.generate_code(knl) lp.auto_test_vs_ref( @@ -275,19 +287,19 @@ def test_vectorize(ctx_factory): def test_extract_subst(ctx_factory): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -492,28 +507,34 @@ def test_add_nosync(): tmp5[i] = 1 {id=insn6,conflicts=g1} """) - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + 
prog.root_kernel.id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog.root_kernel.id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -522,28 +543,30 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", []) + new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_root_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - 
insn_ids = set(insn.id for insn in knl.instructions) + insn_ids = set(insn.id for insn in prog.root_kernel.instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} """) - knl = lp.split_iname(knl, "i", 4, within='id:to_split') + prog = lp.split_iname(prog, "i", 4, within='id:to_split') - for insn in knl.instructions: + for insn in prog.root_kernel.instructions: if insn.id == 'to_split': assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) if insn.id == 'not_to_split': @@ -554,7 +577,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -564,10 +587,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + prg = lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in six.itervalues(prg.callables_table.resolved_functions)) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, prg) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): diff --git a/test/testlib.py b/test/testlib.py index 67c5ba04fefde9a7516f22bce679744ce61a4f20..c66367a7ccaeae2794c3f5ffa82e5670ada721c2 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + def __call__(self, insn_ids, callables_table, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, + callables_table, ignore_auto) 
return gsize, (self.vecsize,) # }}} @@ -131,4 +133,48 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel, callables_table): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return ( + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker