diff --git a/doc/index.rst b/doc/index.rst index b77bbb16f413defe5010c75d28464051553b4486..9a10116d916468fd46b9b23ad113f3d9085ae699 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a59e84282119209cc89eb18e3a4eda97725edf0 --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,193 @@ +Calling Loopy Kernels and External Functions +============================================ + +Goals of a function interface +----------------------------- + +- *FIXME:* Needs to change after the new design of program. + +- Must be able to have complete information of the function just through the + expression node. +- Must adhere to :mod:`loopy` semantics of immutability. +- Must have a class instance linked with the expression node which would record + the properties of the function. +- Must indicate in the expression if the function is known to the kernel. (This + is intended to be done by making the function expression node an instance of + ``ResolvedFunction`` as soon as the function definition is resolved by the + kernel) +- Function overloading is not encouraged in :mod:`loopy` as it gives rise to + contention while debugging with the help of the kernel intermediate + representation and hence if the expression nodes point to different function + instances they must differ in their representation. For example: ``float + sin(float )`` and ``double sin(double )`` should diverge by having different + identifiers as soon as data type of the argument is inferred. +- Must have an interface to register external functions. + + +Scoped Function and resolving +----------------------------- + +``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` +kernel, whose name has been resolved by the kernel. 
The process of matching a +function identifier with the function definition is called "resolving". + +A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it +is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a +:attr:`LoopKernel.scoped_functions` + +- Functions already registered by the target. Some examples include -- + ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) +- Functions that are defined in ``Loo.py`` and are realized into + a different set of instructions during code generation. Some examples + include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... +- Functions registered as ``CallableKernels`` using + ``lp.register_callable_kernel(...)``. +- Functions that have been provided through + ``lp.register_function_id_to_in_knl_callable_mapper(...)`` +- Functions that can be made known from the user through + ``lp.register_function_mangler``. This is planned to be deprecated, + as its functionality is superseded by + ``lp.register_function_id_to_in_knl_callable_mapper(...)``. + +Expressions after a function is scoped +-------------------------------------- + +Consider the following expression. + +:: + + sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) + +During the kernel creation phase, the kernel would know that ``sin`` is +a function known to the target and hence it should be scoped. And as +expected, after ``make_kernel`` has been called the above expression +would get converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + callable_knl_func(c[i])*mangler_call(d[i]) + +This would also make an entry in the kernel's ``scoped_functions`` +dictionary as: + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None)} + +It might be noteworthy that at this step, it only scopes functions +through their names without any information about the types of the +function. 
+ +Once the user calls the transformation: +``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, +the expression gets converted to: + +:: + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) + +This also makes an entry in the ``scoped_functions`` dictionary as -- + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)} + +Now, if the user calls +``register_function_mangler(knl, 'mangler_call')``, one might expect +that the mangler call function should get scoped, but that does **not** +happen, because the "old" ``function_manglers``, would return a match +only if all the parameters of the function match viz. name, argument +arity and argument types. Hence, the ``scoped_functions`` dictionary +would remain unchanged. + +``ResolvedFunctions`` and specializations +----------------------------------------- + +Consider the same ``ResolvedFunction('sin')`` as above. This function +although scoped does not know the types, i.e. it does not yet know, +for a ``C-Target``, whether it should emit ``sin`` or ``sinf`` or +``sinl``. Hence, right now the function can be called as a +"type-generic" function as further in the pipeline it can take any one +of the above definitions. The functions go through a "specialization" +process at various points in the pipeline, where the attributes of the +callables are resolved. + +- During type inference, the functions go through type specialization + wherein the ``arg_id_to_dtype`` of the functions is realized. +- During descriptor inference, the functions go through a description + specialization where the ``arg_id_to_descr`` is populated. 
The + ``arg_id_to_descr`` contains important information regarding shape, + strides and scope of the arguments which form an important part of + ``CallableKernel`` as this information would be helpful to + generate the function signature and make changes to the data access + pattern of the variables in the callee kernel. +- Whenever a ``ResolvedFunction`` goes through a specialization, this is + indicated by changing the name in the ``pymbolic`` node. + +If, during type inference, it is inferred that the type of ``a[i]`` is +``np.float32``, the new ``pymbolic`` node would be: + +:: + + ResolvedFunction('sin_0')(a[i]) + ... + +This name change is done so that it indicates that the node points to a +different ``ScalarCallable`` in the dictionary. And hence a new entry is +added to the ``scoped_functions`` dictionary as: + +:: + + {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None), + 'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32, + -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')} + +Description Inference +--------------------- + +Although this step has no significance for a ``ScalarCallable``, it +forms a very important part of ``CallableKernel``, in which the +``dim_tags``, ``shape`` and ``address_space`` of the arguments of the +callable kernel are altered. + +- The ``dim_tags`` attribute helps to ensure that the memory layout + between the caller and the callee kernel is coherent. +- The ``address_space`` attribute ensures that, while writing the device + code we emit the appropriate scope qualifiers for the function + declaration arguments. +- The ``shape`` attribute helps in: + + - Storage allocation. + - Memory layout. + - Out of bounds accesses to be caught in ``Loo.py``. 
+ +Hence, in the ``Loo.py`` pipeline, one might expect the following +developments of the ``sin`` pymbolic call expression node. + +:: + + sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> + (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) + +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- + +The earlier "function\_mangler" as a member method of the class +``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The +function scopers would return a list of functions with the signature +``(target, identifier)->lp.InKernelCallable``. + +An example: Calling BLAS +------------------------ + +.. literalinclude:: ../examples/python/external-call.py + diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3c85060dacf03b52f6e0b1faf05ad4697b6a5d07..2a9756b207f31968b47b86279ef1aa71b2a1e46a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -334,7 +334,7 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl.root_kernel.stringify(with_dependencies=True)) --------------------------------------------------------------------------- KERNEL: loopy_kernel --------------------------------------------------------------------------- @@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... 
loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1207,8 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1237,10 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1279,7 +1281,7 @@ does in more detail: The kernel translates into two OpenCL kernels. 
- >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1906f703b0efc39808ff68a63b91ff37..884fb0bd19c48fe15bd12898e4b684990c986ff4 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table) # map schedule onto host or device print(knl) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index fa581d4262e2f06addf81aeaecca5ed2f8f8c8f1..90f31f0946d06edf5565e744b9080c59c66818ca 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -29,8 +29,6 @@ def transform(knl, vars, stream_dtype): def gen_code(knl): - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) codegen_result = lp.generate_code_v2(knl) return codegen_result.device_code() + "\n" + codegen_result.host_code() diff --git a/examples/python/sparse.py b/examples/python/sparse.py index 0e56df1bc3085976bfadd783f976fa912af45da8..7791f41ba808b5fc822277096cf079a5fe6435b5 100644 --- a/examples/python/sparse.py +++ b/examples/python/sparse.py @@ -14,6 +14,6 @@ k = lp.make_kernel([ """) k = lp.add_and_infer_dtypes(k, { - "values,x": np.float64, "rowstarts,colindices": k.index_dtype + "values,x": np.float64, "rowstarts,colindices": k.root_kernel.index_dtype }) -print(lp.generate_code(k)[0]) +print(lp.generate_code_v2(k).device_code()) diff --git a/loopy/__init__.py b/loopy/__init__.py index d69a57bf1a5435adfb067df5cfb2080633cac765..9c420166228af255db94774fc94f3ae66aa0e345 100644 --- 
a/loopy/__init__.py +++ b/loopy/__init__.py @@ -29,13 +29,10 @@ from six.moves import range, zip from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning - +from loopy.program import iterate_over_kernels_if_given_program # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( MemoryOrdering, memory_ordering, MemoryScope, memory_scope, @@ -51,6 +48,10 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import ( + CallablesTable, Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -63,7 +64,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -119,10 +120,14 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import ( + register_function_id_to_in_knl_callable_mapper) + # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import preprocess_kernel, realize_reduction +from loopy.preprocess import (preprocess_kernel, realize_reduction, + preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_poly, get_op_map, 
get_lmem_access_poly, @@ -170,6 +175,10 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", "CallableKernel", + + "CallablesTable", "Program", "make_program", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -177,9 +186,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "default_function_mangler", "single_arg_function_mangler", - - "make_kernel", "UniqueName", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", @@ -232,6 +239,8 @@ __all__ = [ "add_barrier", + "register_function_id_to_in_knl_callable_mapper", + # }}} "get_dot_dependency_graph", @@ -247,7 +256,7 @@ __all__ = [ "infer_unknown_types", - "preprocess_kernel", "realize_reduction", + "preprocess_kernel", "realize_reduction", "preprocess_program", "generate_loop_schedules", "get_one_scheduled_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", @@ -297,6 +306,7 @@ __all__ = [ # {{{ set_options +@iterate_over_kernels_if_given_program def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional @@ -304,6 +314,7 @@ def set_options(kernel, *args, **kwargs): See also :class:`Options`. 
""" + assert isinstance(kernel, LoopKernel) if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -335,6 +346,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration +@iterate_over_kernels_if_given_program def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` @@ -359,6 +371,7 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) +@iterate_over_kernels_if_given_program def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -376,6 +389,7 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) +@iterate_over_kernels_if_given_program def register_function_manglers(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` @@ -441,7 +455,7 @@ class CacheMode(object): # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`LoopKernel` that changes the data layout + """Returns a :class:`loopy.Program` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. 
*old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 92c01addb27dd232fd41f40048cb15d5510d765a..7e23ef06f669f660a6af57f8594be4fa8a45061f 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -29,6 +29,7 @@ from warnings import warn import numpy as np import loopy as lp + from loopy.diagnostic import LoopyError, AutomaticTestFailure @@ -75,7 +76,7 @@ class TestArgInfo(Record): # {{{ "reference" arguments -def make_ref_args(kernel, impl_arg_info, queue, parameters): +def make_ref_args(program, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -88,7 +89,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_arg_data = [] for arg in impl_arg_info: - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + kernel_arg = program.impl_arg_to_arg.get(arg.name) if arg.arg_class is ValueArg: if arg.offset_for_name: @@ -117,7 +118,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel.get_written_variables() + is_output = kernel_arg.is_output_only if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( @@ -366,7 +367,7 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors): # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, + ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, @@ -383,24 +384,26 @@ def auto_test_vs_ref( import pyopencl as cl - if test_knl is None: - test_knl = ref_knl + if test_prog is None: + test_prog = ref_prog do_check = False - if len(ref_knl.args) != len(test_knl.args): - raise LoopyError("ref_knl and test_knl do not have the same number " + ref_prog = 
lp.preprocess_kernel(ref_prog) + test_prog = lp.preprocess_kernel(test_prog) + + if len(ref_prog.args) != len(test_prog.args): + raise LoopyError("ref_prog and test_prog do not have the same number " "of arguments") - for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): + for i, (ref_arg, test_arg) in enumerate(zip(ref_prog.args, test_prog.args)): if ref_arg.name != test_arg.name: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog argument lists disagree at " + "index %d (1-based)" % (i+1)) if ref_arg.dtype != test_arg.dtype: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog argument lists disagree at " + "index %d (1-based)" % (i+1)) - from loopy.compiled import CompiledKernel from loopy.target.execution import get_highlighted_code if isinstance(op_count, (int, float)): @@ -421,7 +424,7 @@ def auto_test_vs_ref( # {{{ compile and run reference code from loopy.type_inference import infer_unknown_types - ref_knl = infer_unknown_types(ref_knl, expect_completion=True) + ref_prog = infer_unknown_types(ref_prog, expect_completion=True) found_ref_device = False @@ -431,30 +434,25 @@ def auto_test_vs_ref( ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + ref_codegen_result = lp.generate_code_v2(ref_prog) - pp_ref_knl = lp.preprocess_kernel(ref_knl) - - for knl in lp.generate_loop_schedules(pp_ref_knl): - ref_sched_kernel = knl - break + ref_implemented_data_info = ref_codegen_result.implemented_data_info logger.info("%s (ref): trying %s for the reference calculation" % ( - ref_knl.name, dev)) + ref_prog.name, dev)) - ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75*"-") print("Reference Code:") print(75*"-") - 
print(get_highlighted_code(ref_compiled.get_code())) + print(get_highlighted_code( + ref_codegen_result.device_code())) print(75*"-") - ref_kernel_info = ref_compiled.kernel_info(frozenset()) - try: ref_args, ref_arg_data = \ - make_ref_args(ref_sched_kernel, - ref_kernel_info.implemented_data_info, + make_ref_args(ref_prog, + ref_implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: @@ -479,13 +477,13 @@ def auto_test_vs_ref( ref_queue.finish() logger.info("%s (ref): using %s for the reference calculation" % ( - ref_knl.name, dev)) - logger.info("%s (ref): run" % ref_knl.name) + ref_prog.name, dev)) + logger.info("%s (ref): run" % ref_prog.name) ref_start = time() if not AUTO_TEST_SKIP_RUN: - ref_evt, _ = ref_compiled(ref_queue, **ref_args) + ref_evt, _ = ref_prog(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) @@ -493,7 +491,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_knl.name) + logger.info("%s (ref): run done" % ref_prog.name) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -514,175 +512,136 @@ def auto_test_vs_ref( queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - from loopy.kernel import KernelState - from loopy.target.pyopencl import PyOpenCLTarget - if test_knl.state not in [ - KernelState.PREPROCESSED, - KernelState.SCHEDULED]: - if isinstance(test_knl.target, PyOpenCLTarget): - test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) - - test_knl = lp.preprocess_kernel(test_knl) - - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] - - test_kernel_count = 0 - from loopy.type_inference import infer_unknown_types - for i, kernel in enumerate(test_kernels): - test_kernel_count += 1 - if test_kernel_count > max_test_kernel_count: - break - kernel = 
infer_unknown_types(kernel, expect_completion=True) - - compiled = CompiledKernel(ctx, kernel) + test_prog = infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog, + test_prog_codegen_result.implemented_data_info, + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel:") + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - kernel_info = compiled.kernel_info(frozenset()) + logger.info("%s: run warmup" % (test_prog.name)) - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - args["out_host"] = False + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - # {{{ find cl program + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - for name in dir(kernel_info.cl_kernels): - if name.startswith("__"): - continue - cl_kernel = getattr(kernel_info.cl_kernels, name) - cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) - break - else: - 
assert False, "could not find cl_program" + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - # }}} + need_check = False - print(type(cl_program)) - if hasattr(cl_program, "binaries"): - print(cl_program.binaries[0]) + events = [] + queue.finish() - print(75*"-") + logger.info("%s: warmup done" % (test_prog.name)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_prog.name)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = warmup_rounds - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = warmup_rounds + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not 
AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_prog.name)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - - print("elapsed: %s s event, 
%s s marker-event %s s wall " - "(%d rounds)%s" % ( - format_float_or_none(elapsed_event), - format_float_or_none(elapsed_event_marker), - format_float_or_none(elapsed_wall), timing_rounds, rates)) - - if do_check: - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) - if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index c31304d87494cbda2a15300b42c6503bceed53d1..64cf80a4e74c38ebac0b29cd0dc460c030baa130 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -56,6 +60,68 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. 
+ """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ResolvedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_resolved(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unsupported instruction type %s." 
% type(insn).__name__) + # }}} @@ -114,6 +180,18 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -128,8 +206,10 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, callables_table): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -142,6 +222,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: @@ -387,11 +482,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def 
check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. " @@ -616,13 +712,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -650,7 +746,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -665,7 +762,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + 
get_insn_ids_for_block_at(kernel.schedule, sched_index), + callables_table) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -682,7 +780,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -733,9 +832,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table) # }}} @@ -889,15 +989,15 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index a92922b1845d76dd7a700a93c05de3eecf8c28dd..ed50cec1fdf19d58df741cca5765822836d61f2e 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -108,9 +108,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", 
".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile, "r") as infile_fd: infile_content = infile_fd.read() @@ -205,7 +207,7 @@ def main(): new_kernels = [] for kernel in kernels: new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) + lp.GlobalArg("occa_info", np.int32, shape=None) ] + kernel.args new_kernels.append(kernel.copy(args=new_args)) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..d8a7effcc49b0eae2ea19b5ed283c5ca7334a6e0 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,10 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from loopy.kernel.function_interface import CallableKernel +from cgen import Collection + + import logging logger = logging.getLogger(__name__) @@ -187,12 +191,17 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: callables_table + + An instance of :class:`loopy.CallablesTable`. 
""" def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + callables_table, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -206,6 +215,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.callables_table = callables_table self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -253,6 +263,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + callables_table=self.callables_table, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -374,19 +385,19 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, callables_table): """ :returns: a :class:`CodeGenerationResult` + + :param kernel: An instance of :class:`loopy.LoopKernel`. + :param callables_table: An instance of + :class:`loopy.CallablesTable`. 
""" from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " @@ -407,11 +418,8 @@ def generate_code_v2(kernel): # }}} - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, callables_table) logger.info("%s: generate code: start" % kernel.name) @@ -470,9 +478,11 @@ def generate_code_v2(kernel): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + callables_table=callables_table) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -524,6 +534,68 @@ def generate_code_v2(kernel): return codegen_result +def generate_code_v2(program): + """ + Returns an instance of :class:`CodeGenerationResult`. + + :param program: An instance of :class:`loopy.Program`. 
+ """ + from loopy.kernel import LoopKernel + from loopy.program import make_program + + if isinstance(program, LoopKernel): + program = make_program(program) + + from loopy.kernel import KernelState + if program.root_kernel.state == KernelState.INITIAL: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + codegen_results = {} + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + codegen_results[func_id] = ( + generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.callables_table)) + if not in_knl_callable.subkernel.is_called_from_host: + assert codegen_results[func_id].host_program is None + + device_preambles = [] + for cgr in codegen_results.values(): + device_preambles.extend(cgr.device_preambles) + + # collecting the function declarations of callee kernels + for in_knl_callable in program.callables_table.values(): + for preamble in in_knl_callable.generate_preambles(program.target): + device_preambles.append(preamble) + + collective_device_program = codegen_results[program.name].device_programs[0] + callee_fdecls = [] + + for func_id, callee_cgr in codegen_results.items(): + if func_id != program.name: + assert len(callee_cgr.device_programs) == 1 + callee_prog_ast = callee_cgr.device_programs[0].ast + collective_device_program = collective_device_program.copy( + ast=Collection([callee_prog_ast, collective_device_program.ast])) + callee_fdecls.append(callee_prog_ast.fdecl) + + for callee_fdecl in callee_fdecls: + collective_device_program = collective_device_program.copy( + ast=Collection([callee_fdecl, collective_device_program.ast])) + + collective_device_programs = [collective_device_program] + ( + codegen_results[program.name].device_programs[1:]) + + return codegen_results[program.name].copy( + 
device_programs=collective_device_programs, + device_preambles=device_preambles) + + def generate_code(kernel, device=None): if device is not None: from warnings import warn diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..e17dd55b8229b7d6d05b2a1497228a46812449bf 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -115,17 +115,21 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) + if kernel.is_called_from_host: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for callee kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 58f055b7b5042ff28f7bf9674b0e7dc5ff1b6269..205935c1b8bd506f43e514fd9ee15da54d3b3ede 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 
4318ad71c1b16deeaac98f8408d5ca82f2de1714..268a70b23981ead4a1828e52efdccacf39814dc2 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code) or ( + codegen_state.kernel.is_called_from_host): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_program=None) return codegen_result diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 05b0a92050a51be1cd980648325921fbf13768d8..0434f4e90eee4c98c726fbf1d3598eb736af6d99 100644 --- 
a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -263,7 +263,7 @@ def parse_fortran(source, filename="", free_form=True, strict=True, from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, - analyze=False, ignore_comments=False) + analyze=False, ignore_comments=False, filename=filename) if tree is None: raise LoopyError("Fortran parser was unhappy with source code " diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 42d7c0f1e2062f84ee171c8c6274a0aa74601a4a..679944acb01c7d306c15bf644626f610e32daa20 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,12 +37,8 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError -from loopy.tools import natsorted +from loopy.tools import natsorted, update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -224,6 +220,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: target A subclass of :class:`loopy.TargetBase`. + + .. attribute:: is_called_from_host + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from other top level kernels. Default value is + *True*. 
+ """ # {{{ constructor @@ -251,6 +253,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): @@ -277,15 +280,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -302,7 +297,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ process assumptions - if assumptions is None: + if assumptions is None and domains: dom0_space = domains[0].get_space() assumptions_space = isl.Space.params_alloc( dom0_space.get_ctx(), dom0_space.dim(dim_type.param)) @@ -312,6 +307,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): dom0_space.get_dim_name(dim_type.param, i)) assumptions = isl.BasicSet.universe(assumptions_space) + elif assumptions is None and not domains: + assumptions = isl.BasicSet.read_from_str( + isl.DEFAULT_CONTEXT, "[] -> { : 1 = 1}") + elif isinstance(assumptions, str): assumptions_set_str = "[%s] -> { : %s}" \ % (",".join(s for s in self.outer_params(domains)), @@ -372,6 +371,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -380,7 +380,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -1040,19 +1040,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): 
constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. - :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, + callables_table, ignore_auto=ignore_auto) all_inames_by_insns = set() @@ -1137,7 +1137,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + @memoize_method + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1148,7 +1150,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, callables_table, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1156,7 +1158,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1164,9 +1166,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, callables_table, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1175,6 +1179,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), + callables_table, ignore_auto=ignore_auto) # }}} @@ -1365,47 +1370,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.program import make_program + program = make_program(self) + return program(*args, **kwargs) # }}} @@ -1506,14 +1478,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): 
"symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash def __hash__(self): from loopy.tools import LoopyKeyBuilder diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 3588f38af13479b127208c25735f1046eaa82706..d079aebe5a58e49839b1416afa71b2256dbc8ce0 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -835,6 +835,7 @@ class ArrayBase(ImmutableRecord): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, **kwargs) def __eq__(self, other): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 5ba147dfdb2e6e9a4bd30ee1b37c31b002da9982..20ffae874464e26f3681804beec1de606cbf3562 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin @@ -33,7 +32,7 @@ from loopy.symbolic import IdentityMapper, WalkMapper from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1134,8 +1133,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() import loopy as lp - - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg if arg_name in self.all_params: return ValueArg(arg_name) @@ -1659,7 +1657,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions_in_single_kernel from loopy.match import MatchExpressionBase new_deps = [] @@ -1668,7 +1666,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions(knl, dep): + for new_dep in find_instructions_in_single_kernel(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True @@ -1949,6 +1947,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) + is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -2160,15 +2159,24 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_args_are_output_only + knl = infer_args_are_output_only(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import 
infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + if is_callee_kernel: + return knl + else: + from loopy.program import make_program + return make_program(knl) + - return knl +def make_function(*args, **kwargs): + kwargs['is_callee_kernel'] = True + return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index dd68c950e04e29fc99c456412d9dc4a53dbc61b2..430eab82725dd4b68fcb3f4e5ae2af0425ed0152 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -338,6 +338,7 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) ImmutableRecord.__init__(self, **kwargs) @@ -363,7 +364,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -403,6 +404,9 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 + # Constant Arg cannot be an output + is_output_only = False + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..1803efdb281f2e33ccf2f198ce768295f3944c2f --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,616 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the 
Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from six.moves import zip + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable +.. autoclass:: ManglerCallable + +""" + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + update_persistent_hash = update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. 
attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + """ + + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import FixedStrideArrayDimTag + + assert isinstance(shape, tuple) + assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) + + # }}} + + super(ArrayArgDescriptor, self).__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + hash_fields = ( + "shape", + "address_space", + "dim_tags") + + update_persistent_hash = update_persistent_hash + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. + """ + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if not arg.is_output_only: + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + else: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer to + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. note:: + + This class acts as a pseudo-callable and its significance lies in + solving picklability issues. 
def __call__(self, insn_ids, callables_table, ignore_auto=True):
    """
    Return the stored grid sizes.

    *insn_ids*, *callables_table* and *ignore_auto* are accepted but not
    used; presumably they are present so that an instance can stand in for
    ``override_get_grid_size_for_insn_ids`` -- confirm against the caller.

    :returns: the tuple ``(self.local_size, self.global_size)``.
    """
    # NOTE(review): ``assign_automatic_axes`` in loopy/kernel/tools.py unpacks
    # ``get_grid_size_upper_bounds_as_exprs`` as ``_, local_size`` (local size
    # second). Confirm that returning the local size first here is intended.
    return self.local_size, self.global_size
def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
    """
    Specialize the callable for the given argument types. This base
    implementation is abstract and always raises
    :exc:`NotImplementedError`.

    :arg arg_id_to_dtype: a mapping from argument identifiers
        (integers for positional arguments, names for keyword
        arguments) to :class:`loopy.types.LoopyType` instances.
        Unspecified/unknown types are not represented in
        *arg_id_to_dtype*.

        Return values are denoted by negative integers, with the
        first returned value identified as *-1*.

    :arg caller_kernel: presumably the kernel in which the call occurs;
        overrides in this diff read ``target`` and ``index_dtype`` from
        it -- TODO confirm.

    :arg callables_table: handed back unchanged as the second entry of
        the returned tuple by the overrides visible in this diff.

    :returns: a tuple ``(new_self, arg_id_to_dtype)``, where *new_self* is a
        new :class:`InKernelCallable` specialized for the given types,
        and *arg_id_to_dtype* is a mapping of the same form as the
        argument above, however it may have more information present.
        Any argument information exists both by its positional and
        its keyword identifier.

        NOTE(review): the overrides in this diff (e.g.
        ``ManglerCallable.with_types``) return
        ``(new_self, callables_table)`` instead -- confirm which contract
        is intended.
    """

    raise NotImplementedError()
def with_target(self, target):
    """
    Return a copy of *self* with all the dtypes in
    :attr:`arg_id_to_dtype` associated with *target*. Refer to
    :meth:`loopy.types.LoopyType.with_target`.

    Entries whose dtype is *None* stay *None*; if :attr:`arg_id_to_dtype`
    itself is *None*, the copy carries *None* as well.

    :arg target: An instance of :class:`loopy.target.TargetBase`.
    :raises LoopyError: if *target* is *None*.
    """

    if target is None:
        raise LoopyError("target cannot be None for with_target")

    new_arg_id_to_dtype = None
    if self.arg_id_to_dtype is not None:
        # Compare against None explicitly: a dtype object that happens to
        # evaluate as false must still be re-targeted, not dropped.
        new_arg_id_to_dtype = {
                arg_id: (dtype.with_target(target)
                    if dtype is not None else None)
                for arg_id, dtype in self.arg_id_to_dtype.items()}

    return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
def __getinitargs__(self):
    """
    Return the constructor arguments of this callable, in the order given
    by ``init_arg_names``: ``(name, arg_id_to_dtype, arg_id_to_descr,
    name_in_target)``.
    """
    # ``name`` is the first constructor argument and has no default, so it
    # must lead the tuple; leaving it out shifts every other entry by one.
    return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr,
            self.name_in_target)
def with_descrs(self, arg_id_to_descr, callables_table):
    """
    Return a copy of *self* specialized with *arg_id_to_descr*, in which
    the return value (argument id ``-1``) is described by a
    :class:`ValueArgDescriptor`.

    :arg arg_id_to_descr: a mapping from argument identifiers to argument
        descriptors. It is not modified.
    :arg callables_table: handed back unchanged.

    :returns: a tuple ``(new_self, callables_table)``.
    """

    # Work on a copy so that the mapping owned by the caller does not
    # silently gain a ``-1`` entry.
    new_arg_id_to_descr = arg_id_to_descr.copy()
    new_arg_id_to_descr[-1] = ValueArgDescriptor()

    return (
            self.copy(arg_id_to_descr=new_arg_id_to_descr),
            callables_table)
def generate_preambles(self, target):
    """
    Yield no preambles; *target* is not used.
    """
    # The unreachable ``yield`` turns this function into a generator; the
    # ``return`` before it ends the iteration straight away, so callers
    # receive an empty iterator.
    return
    yield
def __init__(self, subkernel, arg_id_to_dtype=None,
        arg_id_to_descr=None):
    """
    :arg subkernel: the callee, an instance of
        :class:`loopy.kernel.LoopKernel`. It is stored as a copy whose
        typed arguments have their dtype associated with the subkernel's
        target.
    :arg arg_id_to_dtype: forwarded to :class:`InKernelCallable`.
    :arg arg_id_to_descr: forwarded to :class:`InKernelCallable`.
    """
    assert isinstance(subkernel, LoopKernel)

    super(CallableKernel, self).__init__(
            arg_id_to_dtype=arg_id_to_dtype,
            arg_id_to_descr=arg_id_to_descr)

    retargeted_args = []
    for arg in subkernel.args:
        if arg.dtype is None:
            # untyped arguments are taken over as they are
            retargeted_args.append(arg)
        else:
            retargeted_args.append(
                    arg.copy(dtype=arg.dtype.with_target(subkernel.target)))

    self.subkernel = subkernel.copy(args=retargeted_args)
def with_types(self, arg_id_to_dtype, kernel, callables_table):
    """
    Specialize the callable by asking :attr:`function_mangler` about the
    dtypes of the positional arguments in *arg_id_to_dtype*.

    :arg arg_id_to_dtype: a mapping from argument identifiers to dtypes.
        Entries with a non-negative identifier are passed to the mangler,
        ordered by identifier.
    :arg kernel: passed to the mangler as its first argument.
    :arg callables_table: handed back unchanged.

    :returns: a tuple ``(new_self, callables_table)`` in which *new_self*
        carries the argument and result dtypes and the target name
        reported by the mangler.
    :raises LoopyError: if *self* is already specialized with different
        dtypes, or if the mangler returns a false value.
    """
    known_dtypes = self.arg_id_to_dtype
    if known_dtypes is not None:
        # Already specialized: each dtype supplied now has to agree with
        # the one on record.
        for arg_id in arg_id_to_dtype:
            if known_dtypes[arg_id] != arg_id_to_dtype[arg_id]:
                raise LoopyError("Overwriting a specialized"
                        " function is illegal--maybe start with new instance of"
                        " ManglerCallable?")

    positional_dtypes = tuple(
            arg_id_to_dtype[arg_id]
            for arg_id in sorted(arg_id_to_dtype.keys())
            if arg_id >= 0)

    mangled = self.function_mangler(kernel, self.name, positional_dtypes)
    if not mangled:
        # The mangler does not accept the dtypes that were provided.
        raise LoopyError("Function %s not coherent with the provided types." % (
            self.name))

    new_arg_id_to_dtype = {}
    for pos, dtype in enumerate(mangled.arg_dtypes):
        new_arg_id_to_dtype[pos] = dtype
    for idx, dtype in enumerate(mangled.result_dtypes):
        # results are identified by -1, -2, ...
        new_arg_id_to_dtype[-idx-1] = dtype

    specialized = self.copy(
            name_in_target=mangled.target_name,
            arg_id_to_dtype=new_arg_id_to_dtype)

    return specialized, callables_table
def add_dtypes(program, dtype_dict):
    """Specify remaining unspecified argument/temporary variable types.

    Only the root kernel of *program* is updated.

    :arg program: the program whose ``root_kernel`` receives the dtypes.
    :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype`
        instances

    :returns: the result of ``program.with_root_kernel`` applied to a copy
        of the root kernel with updated arguments and temporary variables.
    :raises RuntimeError: if entries of *dtype_dict* are left unused.
    """
    root_kernel = program.root_kernel
    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(
            root_kernel, dtype_dict)

    if dtype_dict_remainder:
        raise RuntimeError("unused argument dtypes: %s"
                % ", ".join(dtype_dict_remainder))

    root_kernel_with_added_dtypes = (
            root_kernel.copy(args=new_args, temporary_variables=new_temp_vars))

    return program.with_root_kernel(root_kernel_with_added_dtypes)
""" @@ -467,7 +477,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn @@ -748,7 +758,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -762,7 +772,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -790,6 +800,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), + callables_table, axis=recursion_axis) if axis is None: @@ -829,7 +840,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
@@ -840,6 +852,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -861,7 +874,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -929,7 +942,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1855,7 +1869,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_output_only(kernel): """ Returns a copy of *kernel* with the attribute ``is_output_only`` set. @@ -1867,6 +1881,7 @@ def infer_arg_is_output_only(kernel): """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): if arg.is_output_only is not None: diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9fe5c4c040608dc181b96daa812405a65..f225b62f9f77b889c7137d69ff7e3944268641fa 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,38 +22,58 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
class MakeTupleCallable(ScalarCallable):
    """
    Callable behind ``make_tuple``: the result with id ``-i-1`` mirrors the
    positional argument *i*.
    """

    def with_types(self, arg_id_to_dtype, kernel, callables_table):
        """
        Return ``(new_self, callables_table)`` where *new_self* records, for
        every positional argument *i* with a known dtype, the same dtype
        for the result ``-i-1``. The target name is set to
        ``loopy_make_tuple``.
        """
        new_arg_id_to_dtype = arg_id_to_dtype.copy()
        for i in range(len(arg_id_to_dtype)):
            if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None:
                new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i]

        return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
            name_in_target="loopy_make_tuple"), callables_table)

    def with_descrs(self, arg_id_to_descr, callables_table):
        """
        Return ``(new_self, callables_table)`` where *new_self* describes
        every identifier in *arg_id_to_descr*, and the mirrored identifier
        ``-id-1``, by a :class:`ValueArgDescriptor`.
        """
        from loopy.kernel.function_interface import ValueArgDescriptor

        # Each identifier contributes two separate entries. Passing pairs
        # of pairs to ``dict`` would instead create a single entry keyed by
        # the tuple ``(id, descriptor)``.
        new_arg_id_to_descr = {}
        for arg_id in arg_id_to_descr.keys():
            new_arg_id_to_descr[arg_id] = ValueArgDescriptor()
            new_arg_id_to_descr[-arg_id-1] = ValueArgDescriptor()

        return (
                self.copy(arg_id_to_descr=new_arg_id_to_descr),
                callables_table)
class Random123Callable(ScalarCallable):
    """
    In-kernel callable for the Random123 generator functions listed in
    ``FUNC_NAMES_TO_RNG``.
    """

    def with_types(self, arg_id_to_dtype, kernel, callables_table):
        """
        Return ``(new_self, callables_table)``. *new_self* is specialized
        once the dtypes of both positional arguments (ids 0 and 1) are
        known; until then it is a plain copy of *self*.
        """
        if arg_id_to_dtype.get(0) is None or arg_id_to_dtype.get(1) is None:
            # the types provided aren't mature enough to specialize the
            # callable
            return self.copy(), callables_table

        name = self.name
        target = kernel.target

        rng_variant = FUNC_NAMES_TO_RNG[name]

        from loopy.types import NumpyType
        base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
        ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
        key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)

        fn = rng_variant.full_name

        # Only the first result dtype and the target-side name depend on
        # which flavour of the generator was requested.
        if name == fn:
            first_result_dtype = ctr_dtype
            name_in_target = fn+"_gen"
        elif name == fn + "_f32":
            first_result_dtype = target.vector_dtype(
                    NumpyType(np.float32), rng_variant.width)
            name_in_target = name
        elif name == fn + "_f64":
            first_result_dtype = target.vector_dtype(
                    NumpyType(np.float64), rng_variant.width)
            name_in_target = name
        else:
            return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
                    callables_table)

        new_arg_id_to_dtype = {
                -1: first_result_dtype,
                -2: ctr_dtype,
                0: ctr_dtype,
                1: key_dtype}

        specialized = self.copy(
                arg_id_to_dtype=new_arg_id_to_dtype,
                name_in_target=name_in_target)

        return specialized, callables_table

    def generate_preambles(self, target):
        """
        Yield the single ``(identifier, code)`` preamble pair rendered from
        ``PREAMBLE_TEMPLATE`` for the generator variant behind this
        callable.
        """
        rng_variant = FUNC_NAMES_TO_RNG[self.name]

        from loopy.target.pyopencl import PyOpenCLTarget

        preamble_id = "90-random123-"+rng_variant.full_name
        preamble_code = PREAMBLE_TEMPLATE.render(
                is_pyopencl_target=isinstance(target, PyOpenCLTarget),
                rng_variant=rng_variant,
                )

        yield (preamble_id, preamble_code)
def random123_function_id_to_in_knl_callable_mapper(target, identifier):
    """
    Return a :class:`Random123Callable` named *identifier* if *identifier*
    is a key of ``FUNC_NAMES_TO_RNG``, otherwise *None*. *target* is not
    consulted.
    """
    if identifier not in FUNC_NAMES_TO_RNG:
        return None

    return Random123Callable(name=identifier)
from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.tools import update_persistent_hash class ReductionOperation(object): @@ -81,6 +84,9 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) + def get_scalar_callables(self): + return frozenset() + class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -180,7 +186,10 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ResolvedFunction("max")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["max"]) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +197,10 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ResolvedFunction("min")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["min"]) # {{{ base class for symbolic reduction ops @@ -212,6 +224,10 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = update_persistent_hash # }}} @@ -245,7 +261,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -262,7 +278,10 @@ 
class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset(["make_tuple", SegmentedOp(self)]) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -270,34 +289,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = update_persistent_hash # }}} @@ -330,7 +339,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -347,7 +356,10 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset([self.which, "make_tuple", ArgExtOp(self)]) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -355,43 +367,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return 
op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + update_persistent_hash = update_persistent_hash # }}} @@ -446,70 +438,93 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, callables_table): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] 
= result_dtypes[1] + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) + "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), callables_table + + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table) + + def generate_preambles(self, target): + if isinstance(self.name, ArgExtOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, SegmentedOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_func_id_to_in_knl_callable_mapper(target, identifier): + if isinstance(identifier, ReductionOpFunction): + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 4592463822a2321745aaf48a316d16c98d4efca3..66d413987466e98e5a188df93cad49f5584cd3f7 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -25,6 +25,7 @@ THE SOFTWARE. 
import islpy as isl import six +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): @@ -55,6 +56,7 @@ def potential_loop_nest_map(kernel): return result +@iterate_over_kernels_if_given_program def fuse_loop_domains(kernel): from loopy.kernel.tools import is_domain_dependent_on_inames diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2afcd3db4331d57e1e61c48ba521ebaa296ddbb2..85b0c6d4893b7e3f6c86d484de521e10ce65848c 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,7 +27,6 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -37,13 +36,19 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import RuleAwareIdentityMapper +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) # {{{ prepare for caching +@iterate_over_kernels_if_given_program def prepare_for_caching(kernel): import loopy as lp new_args = [] @@ -885,9 +890,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. 
With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -1007,7 +1012,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1125,7 +1130,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1365,7 +1370,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1454,17 +1459,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1663,15 +1668,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ seq/par dispatch - 
def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( + arg_dtypes, reduction_dtypes, callables_table = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1780,15 +1785,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1807,12 +1814,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes) # }}} @@ -1845,9 +1853,13 @@ def realize_reduction(kernel, 
insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + callables_table=callables_table, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = ( + cb_mapper(insn.expression, + callables_table=callables_table),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1935,6 +1947,31 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return kernel + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = realize_reduction_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # }}} @@ -2108,17 +2145,160 @@ def check_atomic_loads(kernel): # }}} +# {{{ arg_descr_inference + +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Infers the :attr:`loopy` + """ + + def __init__(self, rule_mapping_context, caller_kernel, + callables_table): + super(ArgDescrInferenceMapper, self).__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.callables_table = callables_table + + def map_call(self, expr, expn_state, **kwargs): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction + + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + + if isinstance(expr, Call): + kw_parameters = {} + else: + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters + + # descriptors for the args and kwargs of the Call + arg_id_to_descr = dict((i, ValueArgDescriptor()) + for i, par in tuple(enumerate(expr.parameters)) + + tuple(kw_parameters.items())) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + in_knl_callable = self.callables_table[expr.function.name] + new_in_knl_callable, 
self.callables_table = ( + in_knl_callable.with_descrs( + combined_arg_id_to_descr, self.callables_table)) + self.callables_table, new_func_id = ( + self.callables_table.with_callable( + expr.function.function, + new_in_knl_callable)) + + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(kw_parameters)) + ) + + map_call_with_kwargs = map_call + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important role in + # determining the arg_id_to_descr + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn, assignees=insn.assignees)) + elif isinstance(insn, MultiAssignmentBase): + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, callables_table): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer to + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels.
+ """ + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, callables_table) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table + + +def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ + root_kernel_callable = program.callables_table[program.name] + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel = program.root_kernel + + new_root_kernel, callables_table = traverse_to_infer_arg_descr( + root_kernel, callables_table) + new_root_kernel_callable = root_kernel_callable.copy( + subkernel=new_root_kernel) + callables_table, _ = callables_table.with_callable(program.name, + new_root_kernel_callable) + + callables_table = callables_table.with_exit_edit_callables_mode( + old_callables_count) + + return program.copy(callables_table=callables_table) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) - +def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2161,8 +2341,6 @@ def preprocess_kernel(kernel, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
- kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2177,8 +2355,8 @@ def preprocess_kernel(kernel, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - - kernel = realize_reduction(kernel, unknown_types_ok=False) + kernel = realize_reduction_for_single_kernel(kernel, + callables_table, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2222,4 +2400,94 @@ def preprocess_kernel(kernel, device=None): return kernel + +# {{{ hw axes inference + +def infer_hw_axes_sizes(program): + """ + Returns copy of *program* with the hardware axes sizes inferred. + + .. note:: + + - Firstly, computes the collective hardware axes sizes from all the + callable kernels. + - Then, overrides the grid sizes of all the callable kernels to the + collective value. + """ + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_inferred = {} + + for func_id, in_knl_callable in ( + program.callables_table.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_callables_table = ( + program.callables_table.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) + + return program.copy(callables_table=new_callables_table) + +# }}} + + +def preprocess_program(program, device=None): + + if device is not None: + # FIXME: Time to remove this? 
(Git blame shows 5 years ago) + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + # {{{ preprocess callable kernels + + # Callable editing restrictions: + # + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] + # + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.callables_table, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + program = program.copy(callables_table=new_callables_table) + + # }}} + + # infer arg descrs of the callables + program = infer_arg_descr(program) + + program = infer_hw_axes_sizes(program) + + return program + + +# FIXME: Do we add a deprecation warning? 
+preprocess_kernel = preprocess_program + + # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 0000000000000000000000000000000000000000..c8534f0511353da45977ab282df18a585b63e632 --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,983 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six +import re + +from pytools import ImmutableRecord, memoize_method +from pymbolic.primitives import Variable +from functools import wraps + +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleExpander) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction + +from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash +from collections import Counter +from pymbolic.primitives import Call, CallWithKwargs + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: Program +.. autoclass:: CallablesTable + +.. autofunction:: make_program +.. autofunction:: iterate_over_kernels_if_given_program + +""" + + +class ResolvedFunctionMarker(RuleAwareIdentityMapper): + """ + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel as instances of + :class:`loopy.symbolic.ResolvedFunction`. A function is known in the + *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + + unknown_function(y) + ResolvedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. 
+ """ + def __init__(self, rule_mapping_context, kernel, callables_table, + function_id_to_in_knl_callable_mappers): + super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) + self.kernel = kernel + self.callables_table = callables_table + self.function_id_to_in_knl_callable_mappers = ( + function_id_to_in_knl_callable_mappers) + + def find_in_knl_callable_from_identifier(self, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if + *identifier* is known to any of the registered mappers, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + self.function_id_to_in_knl_callable_mappers): + # FIXME: do we really need to pass the target to the mapper? + in_knl_callable = func_id_to_in_knl_callable_mapper( + self.kernel.target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + def map_call(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + + name, tag = parse_tagged_name(expr.function) + if name not in self.rule_mapping_context.old_subst_rules: + new_call_with_kwargs = self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={}), expn_state) + return Call(new_call_with_kwargs.function, + new_call_with_kwargs.parameters) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if not isinstance(expr.function, ResolvedFunction): + + # search the kernel for the function.
+ in_knl_callable = self.find_in_knl_callable_from_identifier( + expr.function.name) + + if in_knl_callable: + # associate the newly created ResolvedFunction with the + # resolved in-kernel callable + + self.callables_table, new_func_id = ( + self.callables_table.with_added_callable( + expr.function, in_knl_callable)) + return type(expr)( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + + # this is an unknown function as of yet, do not modify it + return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + expn_state) + + def map_reduction(self, expr, expn_state): + for func_id in ( + expr.operation.get_scalar_callables()): + in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) + assert in_knl_callable is not None + self.callables_table, _ = ( + self.callables_table.with_added_callable(func_id, + in_knl_callable)) + return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) + + +def _default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by default. + """ + from loopy.library.function import ( + loopy_specific_callable_func_id_to_knl_callable_mappers) + return ( + [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( + target.get_device_ast_builder().function_id_in_knl_callable_mapper( + ))) + + +def initialize_callables_table_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.CallablesTable`, by resolving + the functions based on :mod:`loopy`'s default function resolvers.
+ """ + # collect the default function resolvers + func_id_to_kernel_callable_mappers = ( + _default_func_id_to_kernel_callable_mappers(kernel.target)) + callables_table = CallablesTable({}) + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, callables_table, + func_id_to_kernel_callable_mappers) + + # mark the functions as "Resolved" in the expression nodes. + kernel_with_functions_resolved = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + # collect the updated callables_table + callables_table = resolved_function_marker.callables_table + + callable_kernel = CallableKernel(kernel_with_functions_resolved) + + # add the callable kernel to the callables_table + callables_table, _ = callables_table.with_added_callable( + Variable(kernel.name), callable_kernel) + + return callables_table + + +# {{{ program definition + +class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: name + + An instance of :class:`str`, also the name of the top-most level + :class:`loopy.LoopKernel`. + + .. attribute:: callables_table + + An instance of :class:`loopy.program.CallablesTable`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + .. attribute:: func_id_to_in_knl_callable_mappers + + A list of functions of the signature ``(target: TargetBase, + function_identifier: str)`` that would return an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommended to + go through :func:`loopy.make_kernel`.
+ - This data structure and its attributes should be considered + immutable, any modifications should be done through :method:`copy`. + + .. automethod:: __init__ + .. automethod:: with_root_kernel + """ + def __init__(self, + name, + callables_table, + target, + func_id_to_in_knl_callable_mappers): + assert isinstance(callables_table, CallablesTable) + + assert name in callables_table + + super(Program, self).__init__( + name=name, + callables_table=callables_table, + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "name", + "callables_table", + "target",) + + update_persistent_hash = update_persistent_hash + + def copy(self, **kwargs): + if 'target' in kwargs: + # target attribute of all the callable kernels should be updated. + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.callables_table.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + callables_table = new_self.callables_table.copy( + resolved_functions=new_resolved_functions) + + return super(Program, new_self).copy( + callables_table=callables_table) + else: + return super(Program, self).copy(**kwargs) + + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + return self.root_kernel.get_grid_size_upper_bounds( + self.callables_table, + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( + self.callables_table, + ignore_auto=ignore_auto) + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + + @property + def root_kernel(self): + """ + Returns an instance of :class:`loopy.LoopKernel` denoting the topmost + level kernel. + + .. note:: + + Syntactic sugar. + """ + return self.callables_table[self.name].subkernel + + @property + def arg_dict(self): + """ + Returns ``arg_dict`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ + return self.root_kernel.arg_dict + + @property + def args(self): + """ + Returns ``args`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ + return self.root_kernel.args[:] + + def with_root_kernel(self, root_kernel): + """ + Returns a copy of *self* with the topmost level kernel as + *root_kernel*. 
+ """ + new_in_knl_callable = self.callables_table[ + self.name].copy(subkernel=root_kernel) + new_resolved_functions = ( + self.callables_table.resolved_functions.copy()) + new_resolved_functions[self.name] = new_in_knl_callable + + return self.copy( + callables_table=self.callables_table.copy( + resolved_functions=new_resolved_functions)) + + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + return self.root_kernel.__str__() + +# }}} + + +def next_indexed_function_identifier(function_id): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function_id*. + + *Example:* ``'sin_0'`` will return ``'sin_1'``. + + :arg function_id: An instance of :class:`str`. + """ + + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + + func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$") + + match = func_name.match(function_id) + + if match is None: + if function_id[-1] == '_': + return "{old_name}0".format(old_name=function_id) + else: + return "{old_name}_0".format(old_name=function_id) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*.
+ """ + def __init__(self, rule_mapping_context, renaming_dict): + super(ResolvedFunctionRenamer, self).__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_function(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super(ResolvedFunctionRenamer, self).map_resolved_function( + expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renamed according to *renaming_dict*. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + + +# {{{ counting helpers + +class CallablesCountingMapper(CombineMapper): + """ + Returns an instance of :class:`collections.Counter` with the count of + callables registered in *callables_table*. + + .. attribute:: callables_table + + An instance of :class:`loopy.program.CallablesTable`.
+ """ + def __init__(self, callables_table): + self.callables_table = callables_table + + def combine(self, values): + return sum(values, Counter()) + + def map_call(self, expr): + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + if isinstance(expr.function, (ResolvedFunction)): + in_knl_callable = self.callables_table[expr.function.name] + if isinstance(in_knl_callable, ScalarCallable): + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + elif isinstance(in_knl_callable, CallableKernel): + + # callable kernels have more callables in them. + callables_count_in_subkernel = ( + count_callables_in_kernel( + in_knl_callable.subkernel, + self.callables_table)) + + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + ( + callables_count_in_subkernel) + else: + raise NotImplementedError("Unknown callable type %s." % ( + type)) + else: + return ( + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call + + def map_reduction(self, expr): + return Counter(expr.operation.get_scalar_callables()) + ( + super(CallablesCountingMapper, self).map_reduction(expr)) + + def map_constant(self, expr): + return Counter() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +@memoize_method +def count_callables_in_kernel(kernel, callables_table): + """ + Returns an instance of :class:`collections.Counter` representing the number + of callables in the *kernel* that are registered in + *callables_table*. 
+ """ + assert isinstance(kernel, LoopKernel) + callables_count = Counter() + callables_counting_mapper = CallablesCountingMapper( + callables_table) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_count += ( + callables_counting_mapper(subst_expander( + insn.expression))) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction type %s." % ( + type(insn))) + + return callables_count + +# }}} + + +# {{{ program callables info + +class CallablesTable(ImmutableRecord): + # FIXME: is CallablesTable a better name?(similar to symbol table in + # compilers.) + """ + Records the information of all the callables called in a :class:`loopy.Program`. + + .. attribute:: resolved_functions + + An instance of :class:`dict` that contains a mapping from function + identifier to instances of + :class:`loopy.kernel.function_interface.InKernelCallable` + + .. attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to and instance of :class:`list`that would contain all the + names taken by a function before the current name.(For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) + + .. attribute:: is_being_edited + + An instance of :class:`bool` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + + .. automethod:: __init__ + .. automethod:: callables_count + .. automethod:: with_added_callable + .. automethod:: with_edit_callables_mode + .. automethod:: with_callable + .. 
automethod:: with_exit_edit_callables_mode + """ + def __init__(self, resolved_functions, + history=None, is_being_edited=False): + + if history is None: + history = dict((func_id, frozenset([func_id])) for func_id in + resolved_functions) + + super(CallablesTable, self).__init__( + resolved_functions=resolved_functions, + history=history, + is_being_edited=is_being_edited) + + hash_fields = ( + "resolved_functions", + "is_being_edited", + "history") + + def __hash__(self): + return hash(( + frozenset(six.iteritems(self.resolved_functions)), + frozenset(six.iteritems(self.history)), + self.is_being_edited + )) + + update_persistent_hash = update_persistent_hash + + @property + @memoize_method + def callables_count(self): + """ + Returns an instance of :class:`collection.Counter` representing the number + of times the callables is called in callables_table. + """ + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(self[ + root_kernel_name].subkernel, self)) + + return callables_count + + # {{{ interface to perform edits on callables + + def with_added_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` of ``(new_self, new_function)``. + ``new_self`` is a copy of *self* with the *function* associated with the + *in_kernel_callable*. ``new_function`` is the function identifier that + should be noted in the expression node so that it could be associated + with an instance of :class:`InKernelCallable`. + + .. note:: + + - Always checks whether the + :attr:``loopy.CallablesTable.resolved_functions` has + *in_kernel_callable*, does not introduce copies. 
+ + - The difference between + :meth:`loopy.CallablesTable.with_added_callable` + and :meth:`CallablesTable.with_callable` being that + the former has no support for renaming the callable back i.e. + ``with_callable`` supports renaming from ``sin_0`` to ``sin``, + if possible, through the member method + ``loopy.CallablesTable.with_exit_edit_callables_mode`` + + This subtle difference makes -- + + - :meth:`loopy.CallablesTable.with_added_callable` suitable + for usage while resolving the functions first time, where no + renaming is needed. + + - :meth:`loopy.CallablesTable.with_callable` suitable for + implementing edits in callables during inference-walks. + """ + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | frozenset([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) + + def with_edit_callables_mode(self): + """ + Returns a copy of *self* for a walk traversal through all the callables. + """ + return self.copy( + is_being_edited=True) + + def with_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Also refer -- :meth:`loopy.CallablesTable.with_added_callable` + + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. 
+ + :arg in_kernel_callable: An instance of + :class:`loopy.InKernelCallable`. + + .. note:: + + - Use :meth:`with_added_callable` if a callable is being resolved for the + first time. + """ + + # {{{ non-edit mode + + if not self.is_being_edited: + if function.name in self.resolved_functions and ( + self.resolved_functions[function.name] == in_kernel_callable): + # if not being edited, check that the given function is + # equal to the old version of the callable. + return self, function + else: + print('Old: ', self.resolved_functions[function.name]) + print('New: ', in_kernel_callable) + raise LoopyError("Use 'with_enter_edit_callables_mode' first.") + + # }}} + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | frozenset([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + unique_function_identifier = function.name + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = ( + history[function.name] | frozenset([unique_function_identifier])) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) + + def with_exit_edit_callables_mode(self, old_callables_count): + """ + Returns a copy of *self* with renaming of the callables done whenever + possible. + + *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, + then all the renaming is done such that one of flavors of the callable + is renamed back to ``sin``. 
+ """ + + assert self.is_being_edited + + new_callables_count = self.callables_count + + # {{{ calculate the renames needed + + renames_needed = {} + for old_func_id in old_callables_count-new_callables_count: + # this implies that all the function instances having the name + # "func_id" have been renamed to something else. + for new_func_id in ( + six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + if old_func_id in self.history[new_func_id]: + renames_needed[new_func_id] = old_func_id + break + # }}} + + new_resolved_functions = {} + new_history = {} + + for func_id in new_callables_count: + in_knl_callable = self.resolved_functions[func_id] + if isinstance(in_knl_callable, CallableKernel): + # if callable kernel, perform renames inside its expressions. + old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, renames_needed) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) + + if func_id in renames_needed: + new_func_id = renames_needed[func_id] + new_resolved_functions[new_func_id] = ( + in_knl_callable) + new_history[new_func_id] = self.history[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + new_history[func_id] = self.history[func_id] + + return self.copy( + is_being_edited=False, + resolved_functions=new_resolved_functions, + history=new_history) + + # }}} + + # {{{ behave like a dict(syntactic sugar) + + def __getitem__(self, item): + return self.resolved_functions[item] + + def __contains__(self, item): + return item in self.resolved_functions + + def items(self): + return six.iteritems(self.resolved_functions) + + def values(self): + return six.itervalues(self.resolved_functions) + + def keys(self): + return six.iterkeys(self.resolved_functions) + + # }}} + +# }}} + + +# {{{ helper functions + +def make_program(kernel): + """ + Returns an instance of :class:`loopy.Program` with the *kernel* as the root + kernel. + """ + + # get the program callables info + callables_table = initialize_callables_table_from_kernel(kernel) + + # get the program from program callables info + program = Program( + name=kernel.name, + callables_table=callables_table, + func_id_to_in_knl_callable_mappers=( + _default_func_id_to_kernel_callable_mappers(kernel.target)), + target=kernel.target) + + return program + + +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. 
+ """ + def _collective_transform(program_or_kernel, *args, **kwargs): + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel, *args, **kwargs) + + return wraps(transform_for_single_kernel)(_collective_transform) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 378a1c0bf8c08994abc54e87275678b9e353e719..5b97f1e102faff89419333856af6ee064c2ad68c 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1836,7 +1836,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. 
warning:: @@ -1849,18 +1849,19 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): + for sched in generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -1973,7 +1974,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2030,7 +2032,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2040,10 +2042,10 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. 
- return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2061,7 +2063,8 @@ def get_one_scheduled_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..73fcd75bb2224ed158fd32199c33f95527a3779f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -34,6 +34,7 @@ from loopy.kernel.data import ( from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector from pytools import Record, memoize_method +from loopy.kernel.function_interface import ScalarCallable, CallableKernel __doc__ = """ @@ -60,6 +61,14 @@ __doc__ = """ """ +# FIXME: this is broken for the callable kernel design. +# - The variable name, what if multiple kernels use the same name?(needs a +# different MemAccessInfo) +# - We should also add the cumulative effect on the arguments of callee kernels +# into the caller kernel +# - Make changes to MemAccessInfo to include the effect of several kernels. +# - Renovate `count`. 
+ # {{{ GuardedPwQPolynomial class GuardedPwQPolynomial(object): @@ -649,10 +658,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, callables_table): self.knl = knl + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -707,11 +717,12 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, count_within_subscripts=True): + def __init__(self, knl, callables_table, count_within_subscripts=True): self.knl = knl + self.callables_table = callables_table self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -723,9 +734,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + if isinstance(expr.function, ResolvedFunction): + function_identifier = self.callables_table[ + expr.function.name].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.parameters) @@ -1121,6 +1139,16 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): + from loopy.program import Program + if isinstance(kernel, Program): + if len([in_knl_callable for in_knl_callable in + kernel.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only 
one CallableKernel.") + + kernel = kernel.root_kernel + try: if space is not None: set = set.align_params(space) @@ -1219,9 +1247,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): +def get_unused_hw_axes_factor(knl, callables_table, insn, + disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1259,15 +1288,16 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return add_assumptions_guard(knl, result) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, callables_table, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = knl.insn_inames(insn) if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( @@ -1279,17 +1309,16 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) + unused_fac = get_unused_hw_axes_factor(knl, callables_table, + insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: return c @memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, 
count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1301,11 +1330,12 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, callables_table, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1313,7 +1343,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1344,7 +1374,57 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, +def get_op_map_for_single_kernel(knl, callables_table, + numpy_types=True, count_redundant_work=False, + count_within_subscripts=True, subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." 
% knl.name) + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + op_map = ToCountMap() + op_counter = ExpressionOpCounter(knl, callables_table, + count_within_subscripts) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignee) + op_counter(insn.expression) + for key, val in six.iteritems(ops.count_map): + op_map = ( + op_map + + ToCountMap({key: val}) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + if numpy_types: + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), + ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1405,53 +1485,31 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." 
% knl.name) - - subgroup_size = _process_subgroup_size(knl, subgroup_size) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, count_within_subscripts) - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + callables_count = ( + program.callables_table.callables_count) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - for key, val in six.iteritems(ops.count_map): - op_map = ( - op_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + knl_op_map = get_op_map_for_single_kernel(knl, + program.callables_table, numpy_types, count_redundant_work, + count_within_subscripts, subgroup_size) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + for i in range(callables_count[func_id]): + op_map += knl_op_map + elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) - if numpy_types: - return ToCountMap( - init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return op_map # }}} @@ -1510,7 +1568,80 @@ def _process_subgroup_size(knl, subgroup_size_requested): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, +def get_mem_access_map_for_single_kernel(knl, callables_table, + numpy_types=True, count_redundant_work=False, subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + access_map = ToCountMap() + access_counter_g = GlobalMemAccessCounter(knl, callables_table) + access_counter_l = LocalMemAccessCounter(knl, callables_table) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + access_expr = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + + access_assignee = ( + access_counter_g(insn.assignee) + + access_counter_l(insn.assignee) + ).with_set_attributes(direction="store") + + for key, val in six.iteritems(access_expr.count_map): + + access_map = ( + access_map + + ToCountMap({key: val}) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) + + for key, val in six.iteritems(access_assignee.count_map): + + access_map = ( + access_map + + ToCountMap({key: val}) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + 
key.count_granularity)) + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + if numpy_types: + return ToCountMap( + init_dict=dict( + (MemAccess( + mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + lid_strides=mem_access.lid_strides, + gid_strides=mem_access.gid_strides, + direction=mem_access.direction, + variable=mem_access.variable, + variable_tag=mem_access.variable_tag, + count_granularity=mem_access.count_granularity), + ct) + for mem_access, ct in six.iteritems(access_map.count_map)), + val_type=access_map.val_type + ) + else: + return access_map + + +def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, subgroup_size=None): """Count the number of memory accesses in a loopy kernel. @@ -1596,86 +1727,40 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # (now use these counts to, e.g., predict performance) """ + from loopy.preprocess import preprocess_program, infer_unknown_types - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." 
% knl.name) - - subgroup_size = _process_subgroup_size(knl, subgroup_size) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) - - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) - - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - access_expr = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") - - access_assignee = ( - access_counter_g(insn.assignee) - + access_counter_l(insn.assignee) - ).with_set_attributes(direction="store") - for key, val in six.iteritems(access_expr.count_map): + callables_count = program.callables_table.callables_count - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + knl_access_map = get_mem_access_map_for_single_kernel(knl, + program.callables_table, numpy_types, + count_redundant_work, subgroup_size) - for key, val in six.iteritems(access_assignee.count_map): - - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) - - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + # FIXME: didn't see any easy way to multiply + for i in range(callables_count[func_id]): + access_map += knl_access_map + elif isinstance(in_knl_callable, ScalarCallable): pass else: - 
raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + raise NotImplementedError("Unknown callabke types %s." % ( + type(in_knl_callable).__name__)) - if numpy_types: - return ToCountMap( - init_dict=dict( - (MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tag=mem_access.variable_tag, - count_granularity=mem_access.count_granularity), - ct) - for mem_access, ct in six.iteritems(access_map.count_map)), - val_type=access_map.val_type - ) - else: - return access_map + return access_map # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def get_synchronization_map_for_single_kernel(knl, callables_table, + subgroup_size=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -1717,13 +1802,10 @@ def get_synchronization_map(knl, subgroup_size=None): raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." 
% knl.name) - from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl, callables_table) iname_list = [] result = ToCountMap() @@ -1766,12 +1848,40 @@ def get_synchronization_map(knl, subgroup_size=None): return result + +def get_synchronization_map(program, subgroup_size=None): + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + sync_map = ToCountMap() + callables_count = program.callables_table.callables_count + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + knl_sync_map = get_synchronization_map_for_single_kernel(knl, + program.callables_table, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(callables_count[func_id]): + sync_map += knl_sync_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) + + return sync_map + # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): +def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.Set` instances capturing which indices of each the array *var_name* are read/written (where *direction* is either ``read`` or @@ -1782,13 +1892,6 @@ def gather_access_footprints(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - write_footprints = [] read_footprints = [] @@ -1811,6 +1914,45 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False): + # FIMXE: works only for one callable kernel till now. 
+ if len([in_knl_callable for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + write_footprints = [] + read_footprints = [] + + callables_count = program.callables_table.callables_count + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + knl_write_footprints, knl_read_footprints = ( + gather_access_footprints_for_single_kernel(knl, + ignore_uncountable)) + + # FIXME: didn't see any easy way to multiply + for i in range(callables_count[func_id]): + write_footprints.extend(knl_write_footprints) + read_footprints.extend(knl_read_footprints) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1825,7 +1967,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1836,12 +1978,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f5cf07b0e1d62212ce36edb48f47eb7de7d31451..1407bcaee07afde96daeafab030ba58fa33a31db 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase - +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl @@ -69,22 +69,23 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args): + def map_literal(self, expr, *args, **kwargs): return expr - def map_array_literal(self, expr, *args): - return 
type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + def map_array_literal(self, expr, *args, **kwargs): + return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in + expr.children)) - def map_group_hw_index(self, expr, *args): + def map_group_hw_index(self, expr, *args, **kwargs): return expr - def map_local_hw_index(self, expr, *args): + def map_local_hw_index(self, expr, *args, **kwargs): return expr - def map_loopy_function_identifier(self, expr, *args): + def map_loopy_function_identifier(self, expr, *args, **kwargs): return expr - def map_reduction(self, expr, *args): + def map_reduction(self, expr, *args, **kwargs): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -98,15 +99,18 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args), + self.rec(expr.expr, *args, **kwargs), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args): + def map_tagged_variable(self, expr, *args, **kwargs): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + def map_type_annotation(self, expr, *args, **kwargs): + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE 
return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + return "ResolvedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -289,6 +303,9 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + def map_resolved_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -638,6 +655,51 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. 
+ """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ReductionOpFunction + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, ReductionOpFunction): + return self.function + else: + raise LoopyError("Unexpected function type %s in ResolvedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_resolved_function") + # }}} @@ -650,9 +712,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -850,12 +915,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not 
isinstance(expr.function, p.Variable): @@ -911,7 +978,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -920,7 +987,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2fd7b52ba514af936441c7a2d980c77b5..f27ee4e96f11f686250bddf57ec87422c717373e 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, callables_table): pass # }}} @@ -150,7 +150,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_id_in_knl_callable_mapper(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. + """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 17dd9dc1034f4572e2bcf1d3abc806354c73336e..48ba036e0ba3b29ac6841a339eda849ce7b72f87 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. 
import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,71 +354,116 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. 
+ """ - if name in ["abs", "min", "max"]: - name = "f" + name + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + name = self.name - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + if name in ["abs", "min", "max"]: + name = "f" + name - dtype = arg_dtypes[0].numpy_dtype + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: # pylint:disable=no-member - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: # pylint:disable=no-member - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." 
% name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: # pylint:disable=no-member + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. 
+ """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -427,12 +472,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -445,6 +484,11 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_id_in_knl_callable_mapper(self): + return ( + super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -846,82 +890,31 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.callables_table[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
+ if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - 
CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 60947c7f77d09582868304ded121386bbb3aab68..dde37739da043f5533a5438b09d7929344ab8571 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -378,7 +379,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). 
If the @@ -387,35 +388,35 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -424,14 +425,14 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, + codegen_result.implemented_data_info, all_code, self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, 
implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) # }}} @@ -448,7 +449,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8ef921e447bf10d85ac60460f904d528ac64da19..70a9ff0d5e5a0a45adcd28db6e9c1128ec2b1384 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -394,19 +395,19 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = ( + self.codegen_state.callables_table[expr.function.name].name) + if identifier_name in ["indexof", 
"indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -418,11 +419,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): @@ -441,56 +442,25 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 
'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.codegen_state.callables_table[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = ( + self.codegen_state.callables_table[ + expr.function.name]) + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return ( + self.codegen_state.callables_table[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 8f14738c307ce7f2d98a47ef0dc086b4c69f7910..c41361e893f8ba78feb0be6253ff9dbce8500696 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def 
cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, caller_kernel, + callables_table): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), + 0: dtype, 1: dtype}), + callables_table) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -217,13 +271,12 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) + def function_id_in_knl_callable_mapper(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) # }}} @@ -249,7 +302,8 @@ class CUDACASTBuilder(CASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf20577bf995b351f90615dd18f7bd0681be0b..f6a1d9ad0c3fc2ea5c8c50622ffb0dedde1c8b38 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,12 @@ class SeparateArrayPackingController(object): 
It also repacks outgoing arrays of this type back into an object array. """ - def __init__(self, kernel): + def __init__(self, program): # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program.args: if not isinstance(arg, ArrayBase): continue @@ -82,7 +82,8 @@ class SeparateArrayPackingController(object): name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program.root_kernel.get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -143,7 +144,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -168,7 +169,8 @@ class ExecutionWrapperGeneratorBase(object): if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -214,9 +216,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, kernel, + def generate_integer_arg_finding_from_offsets(self, gen, program, implemented_data_info): - options = kernel.options + options = program.root_kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -239,7 +241,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not 
options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -264,8 +266,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + self, gen, program, implemented_data_info): + options = program.root_kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -284,7 +286,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -307,8 +309,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: + self, gen, program, implemented_data_info): + if program.root_kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -361,7 +363,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, implemented_data_info, options): + self, gen, program, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -384,8 +386,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in program.root_kernel.get_written_variables() + program_arg = program.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -447,7 +449,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - 
gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, program_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -465,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -493,10 +495,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if kernel_arg.shape is None: + if program_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in program_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -519,8 +521,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + if arg.unvec_strides and program_arg.dim_tags: + itemsize = program_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) @@ -558,7 +560,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." + "default_offset=loopy.auto to make_program()." 
"\")" % arg.name) gen("") @@ -617,7 +619,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -629,12 +631,12 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = kernel.options + options = program.root_kernel.options implemented_data_info = codegen_result.implemented_data_info from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % program.name, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -651,21 +653,21 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program, implemented_data_info, options) self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + program, implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program, implemented_data_info) if options.write_wrapper: output = gen.get() @@ -713,32 +715,32 @@ class KernelExecutorBase(object): .. 
automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program.args + if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - kernel = self.kernel + program = self.program if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = kernel.impl_arg_to_arg[var].name + dest_name = program.impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -749,28 +751,30 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - kernel = add_dtypes(kernel, var_to_dtype) + program = add_dtypes(program, var_to_dtype) - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.root_kernel.schedule is None: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + program = program.with_root_kernel( + get_one_scheduled_kernel(program.root_kernel, + program.callables_table)) - return kernel + return program - def 
get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. - cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -778,9 +782,9 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -791,7 +795,7 @@ class KernelExecutorBase(object): if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) @@ -823,7 +827,7 @@ class KernelExecutorBase(object): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -831,7 +835,7 @@ class KernelExecutorBase(object): arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/target/ispc.py 
b/loopy/target/ispc.py index cccee2301e44b16e2454bda5e98af7db7893c003..1eacbd94c4737ebc20058441e406a211d2a69dde 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,8 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_check(self, kernel, callables_table): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 8a6e5284258d864d19d7f1353ec9dfaaa7d72a9b..d0878c09cb613c48c81490e670d91442af0d9fa9 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,135 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}), + callables_table) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return ( + self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. 
+ return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) + + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) return None @@ -280,6 +356,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -365,13 +442,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def function_id_in_knl_callable_mapper(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( @@ -380,13 +454,10 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -407,7 +478,8 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c5e8d0a7f7a9f70b3afe46e9d04a3bf861066329..05f69df4647030e18a027088010151371a230636 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from 
loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -135,7 +135,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -152,7 +152,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -200,37 +201,89 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." 
% name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + callables_table) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. 
+ numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + +def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -345,8 +398,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -743,19 +796,17 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_id_in_knl_callable_mapper(self): + from loopy.library.random123 import ( + random123_function_id_to_in_knl_callable_mapper) return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_id_to_in_knl_callable_mapper, + random123_function_id_to_in_knl_callable_mapper] + super( + PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 
0186a279fed37913e4e29a31f1f5c0933b1b2ea9..df045a9e35f0fb4005176cab49649b4bd090fdfa 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): - if kernel.options.cl_exec_manage_array_events: + def generate_invocation(self, gen, program_name, args, + program, implemented_data_info): + if program.root_kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") - gen("_lpy_evt = {kernel_name}({args})" + gen("_lpy_evt = {program_name}({args})" .format( - kernel_name=kernel_name, + program_name=program_name, args=", ".join( ["_lpy_cl_kernels", "queue"] + args + ["wait_for=wait_for"]))) - if kernel.options.cl_exec_manage_array_events: + if program.root_kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + and arg.base_name in ( + program.root_kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue - is_written = arg.base_name in kernel.get_written_variables() + is_written = arg.base_name in ( + program.root_kernel.get_written_variables()) if is_written: gen("%s = 
%s.get(queue=queue)" % (arg.name, arg.name)) @@ -218,12 +220,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -252,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -261,41 +264,41 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__(program) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=( - kernel.target.with_device(context.devices[0]))) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=( + program.target.with_device(context.devices[0]))) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = dev_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -303,17 +306,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + 
.build(options=program.root_kernel.options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -348,10 +351,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3d2a39dcf7126339055d32fa16ffcc25..1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -82,47 +83,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.callables_table[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not 
yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.codegen_state.callables_table[ + expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +180,12 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def 
function_id_in_knl_callable_mapper(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, + self).function_id_in_knl_callable_mapper() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/tools.py b/loopy/tools.py index 0fc6d1bf9b3885db86cc1f4642a4e1342fcfd5a0..56942820dac22d1c6c50daa939ef61674eb45d68 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -48,6 +48,17 @@ else: return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. + """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -78,11 +89,17 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key)): + for dict_key in sorted(six.iterkeys(key), key=lambda obj: + type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index a20a798cfa35c64c0cbd7097b41824dda2a35a84..f4a184f632d251bed7ec7d6ace718b3851c5c0d8 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,6 +26,8 @@ THE SOFTWARE. 
from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. currentmodule:: loopy @@ -36,8 +38,10 @@ __doc__ = """ # {{{ add_barrier -def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, - tags=None, synchronization_kind="global", mem_kind=None): +@iterate_over_kernels_if_given_program +def add_barrier(knl, insn_before="", insn_after="", + id_based_on=None, tags=None, synchronization_kind="global", + mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to *synchronization_kind* """ + assert isinstance(knl, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind @@ -76,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, mem_kind=mem_kind) new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(kernel=new_knl, + new_knl = add_dependency(new_knl, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index b7f47c38a6a0daf8e4495c16791ef2f955019649..3df86e7ae04073e654f91b30c584719c165269d0 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,9 +27,13 @@ import six from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = 
ConstantFoldingMapper() @@ -53,7 +57,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index f6568918d30f33d4c7103e40d02bdc40c38dfa1b..4585ab7f14c7d01607e1128219d6971328ce056c 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,6 +29,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. currentmodule:: loopy @@ -102,8 +105,9 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", - sequential=False): +@iterate_over_kernels_if_given_program +def to_batched(knl, nbatches, batch_varying_args, + batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. 
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 63d3a40fb6c6967cac5e6149d5cf51bb7c2efbb9..6849e40c352bb93013673e63309f0cd28ed31905 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import ScalarCallable, CallableKernel from pymbolic import var @@ -130,10 +133,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
""" + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = buffer_array_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 0000000000000000000000000000000000000000..0013de1d5c855ed90488a39846712fb3808869a9 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,123 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" +from loopy.diagnostic import LoopyError +from loopy.kernel.function_interface import CallableKernel +from loopy.program import ResolvedFunctionMarker + +__doc__ = """ +.. currentmodule:: loopy + +.. 
autofunction:: register_function_id_to_in_knl_callable_mapper + +""" + + +# {{{ register function lookup + +def _resolved_callables_from_function_lookup(program, + func_id_to_in_kernel_callable_mapper): + """ + Returns a copy of *program* with the expression nodes marked "Resolved" + if any match is found through the given + *func_id_to_in_kernel_callable_mapper*. + + :arg func_id_to_in_kernel_callable_mapper: A function with signature + ``(target, identifier)`` that returns either an instance of + :class:`loopy.InKernelCallable` or *None*. + """ + callables_table = program.callables_table + + callable_knls = dict( + (func_id, in_knl_callable) for func_id, in_knl_callable in + callables_table.items() if isinstance(in_knl_callable, + CallableKernel)) + edited_callable_knls = {} + + for func_id, in_knl_callable in callable_knls.items(): + kernel = in_knl_callable.subkernel + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, callables_table, + [func_id_to_in_kernel_callable_mapper]) + + new_subkernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + callables_table = resolved_function_marker.callables_table + + edited_callable_knls[func_id] = in_knl_callable.copy( + subkernel=new_subkernel) + + new_resolved_functions = {} + + for func_id, in_knl_callable in callables_table.items(): + if func_id in edited_callable_knls: + new_resolved_functions[func_id] = edited_callable_knls[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + callables_table = callables_table.copy( + resolved_functions=new_resolved_functions) + + return program.copy(callables_table=callables_table) + + +def register_function_id_to_in_knl_callable_mapper(program, + func_id_to_in_knl_callable_mapper): + """ + Returns a copy of *program* 
with the *function_lookup* registered. + + :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, + identifier)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if + the *function_identifier* is not known. + """ + + # adding the function lookup to the set of function lookers in the kernel. + if func_id_to_in_knl_callable_mapper not in ( + program.func_id_to_in_knl_callable_mappers): + from loopy.tools import unpickles_equally + if not unpickles_equally(func_id_to_in_knl_callable_mapper): + raise LoopyError("function '%s' does not " + "compare equally after being upickled " + "and would disrupt loopy's caches" + % func_id_to_in_knl_callable_mapper) + new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( + [func_id_to_in_knl_callable_mapper]) + + program = _resolved_callables_from_function_lookup(program, + func_id_to_in_knl_callable_mapper) + + new_program = program.copy( + func_id_to_in_knl_callable_mappers=new_func_id_mappers) + + return new_program + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 7fbc595f27ea34f9056b36e3d6f0e168b182c24e..f3bce038eea90f633429bc1d0ae30a32c245a9bb 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,6 +30,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -140,7 +143,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. 
default_tag=_not_provided, @@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. """ + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -328,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # precompute module, but precompute acutally uses that to adjust its # warning message. - from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, callables_table, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # }}} @@ -385,6 +415,7 @@ def change_arg_to_image(knl, name): # {{{ tag array axes +@iterate_over_kernels_if_given_program def tag_array_axes(knl, ary_names, dim_tags): """ :arg dim_tags: a tuple of @@ -423,13 +454,15 @@ def tag_array_axes(knl, ary_names, dim_tags): return knl -tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes) +tag_data_axes = ( + MovedFunctionDeprecationWrapper(tag_array_axes)) # }}} # {{{ set_array_axis_names +@iterate_over_kernels_if_given_program def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -454,13 +487,15 @@ def set_array_axis_names(kernel, ary_names, dim_names): return kernel -set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names) +set_array_dim_names = (MovedFunctionDeprecationWrapper( + set_array_axis_names)) # }}} # {{{ remove_unused_arguments +@iterate_over_kernels_if_given_program def remove_unused_arguments(knl): new_args = [] @@ -502,6 +537,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries +@iterate_over_kernels_if_given_program def alias_temporaries(knl, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of @@ -586,11 +622,14 @@ def alias_temporaries(knl, names, base_name_prefix=None, # {{{ set argument order +@iterate_over_kernels_if_given_program def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ + #FIXME: @inducer -- should this only affect the root kernel, or should it + # take a within?
if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -619,6 +658,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@iterate_over_kernels_if_given_program def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 @@ -664,6 +704,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@iterate_over_kernels_if_given_program def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -705,6 +746,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701f4f23a5b1c66b1559bf6c4879425902..54d06605a9ec4e65ba93a0a21d66b69bbe53bfa6 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -33,6 +33,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -370,6 +371,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. 
""" + assert isinstance(knl, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 49e30a7516cbbf00a07aace34831eb857a877432..9b83f242bde7923a3932a00b42f442954cf9a7db 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -31,6 +31,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.program import rename_resolved_functions_in_a_single_kernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -287,50 +291,8 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_kernels(kernels, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. 
- - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ +def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -411,4 +373,101 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result + +def fuse_kernels(programs, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. 
+ Use :func:`loopy.rename_iname` if this is not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. + + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. + + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + + # all the resolved functions in programs must be registered in + # main_callables_table + main_prog_callables_info = ( + programs[0].callables_table) + old_root_kernel_callable = ( + programs[0].callables_table[programs[0].name]) + kernels = [programs[0].root_kernel] + + # removing the callable collisions that maybe present + for prog in programs[1:]: + root_kernel = prog.root_kernel + renames_needed = {} + for old_func_id, in_knl_callable in prog.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + # Fusing programs with multiple callable kernels is tough. + # Reason: Need to first figure out the order in which the + # callable kernels must be resolved into + # main_callables_table, because of renaming is + # needed to be done in the callable kernels before registering. + # Hence disabling it until required. + if in_knl_callable.subkernel.name != prog.name: + raise LoopyError("fuse_kernels cannot fuse programs with " + "multiple callable kernels.") + + # root kernel are dealt at the end after performing all the + # renaming. 
+ continue + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_added_callable(var(old_func_id), + in_knl_callable)) + + if old_func_id != new_func_id: + renames_needed[old_func_id] = new_func_id + + if renames_needed: + root_kernel = rename_resolved_functions_in_a_single_kernel( + root_kernel, renames_needed) + + kernels.append(root_kernel) + + new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) + new_root_kernel_callable = old_root_kernel_callable.copy( + subkernel=new_root_kernel.copy(name=programs[0].name)) + + # TODO: change the name of the final root kernel. + main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( + var(programs[0].name), new_root_kernel_callable) + + return programs[0].copy( + callables_table=main_prog_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 51de1119112b9ce4e311a743d2cd7405398179ed..754802eca283b2f8f39e97ceb34371c2a2f78643 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,6 +34,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. currentmodule:: loopy @@ -91,6 +95,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. 
If the @@ -105,6 +110,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -314,13 +321,15 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) + return tag_inames(kernel, {outer_iname: outer_tag, + inner_iname: inner_tag}) # }}} # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -346,6 +355,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -362,6 +373,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -496,6 +508,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super(_InameJoiner, self).map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """ :arg inames: fastest varying last @@ -640,7 +653,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. 
*new_tag* is given @@ -824,7 +839,9 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, +@iterate_over_kernels_if_given_program +def duplicate_inames(knl, inames, within, new_inames=None, + suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -986,7 +1003,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1052,7 +1069,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1080,18 +1097,42 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + for option in get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into): + yield option + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." 
+ % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1298,6 +1339,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1317,6 +1359,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1340,6 +1383,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. 
@@ -1671,6 +1715,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1717,6 +1762,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(knl, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..f73110ecdff79d7c029c0dd0d895ef71ea68326b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -25,15 +25,35 @@ THE SOFTWARE. import six # noqa from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable))) + + return insns + # }}} @@ -58,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -75,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -92,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -209,6 +232,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -228,6 +252,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -260,18 +285,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -324,6 +352,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index d695e359558894c201f67e4013b25f5f45c19d82..3e5e4a43b8bc24d997b0723b2f7be4423e8247b1 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,6 +28,9 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -44,7 +47,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -237,7 +242,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy import split_iname + from loopy.transform.iname import split_iname for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, @@ -370,7 +375,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -387,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): There was a more complicated, dumber function called :func:`split_array_dim` that had the role of this function in versions prior to 2016.2. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -439,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index fc5dad91dd73245c328f06e2c452b1d3d3a1da2b..b7d017ec83c4ed0f21e529713bc379dadb2dd400 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -40,6 +43,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. @@ -134,6 +138,7 @@ def _fix_parameter(kernel, name, value): )) +@iterate_over_kernels_if_given_program def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. @@ -141,6 +146,7 @@ def fix_parameters(kernel, **value_dict): to be *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. 
""" + assert isinstance(kernel, LoopKernel) for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index f2b184a4119485e53d7dee14b1a322be45a0bfe3..acc2496ac82f603f9b9cfb390ed7d2bc9e4f4ada 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -38,6 +38,9 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -258,9 +261,9 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, callables_table, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. 
@@ -1037,15 +1040,40 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = precompute_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_callables_table = program.callables_table.copy( + resolved_functions=new_resolved_functions) + return program.copy(callables_table=new_callables_table) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index cca62bc522bb110ec4aeb190b538e5b6e8583abf..e463353ef9ea0860188b49fcb8f2f06bb96b0f41 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -64,7 +64,7 @@ class LivenessAnalysis(object): def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -235,8 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, callables_table): self.kernel = kernel + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -439,7 +440,8 @@ class TemporarySaver(object): return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
@@ -628,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -722,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(knl): +def save_and_reload_temporaries(program): """ Add instructions to save and reload temporary variables that are live across kernel calls. @@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl): :returns: The resulting kernel """ + + knl = program.root_kernel + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program.root_kernel, + program.callables_table) + + assert knl.schedule is not None + liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) @@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_root_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b92698ffa1e84455be3f79bed7dbf884f36be490..7363cdc3c9f110591c35f324fb7b977974ed10ee 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -33,6 +33,8 @@ from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -44,6 
+46,7 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst +@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -285,6 +288,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -468,6 +472,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions @@ -476,6 +481,7 @@ def expand_subst(kernel, within=None): :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + if not kernel.substitutions: return kernel @@ -508,8 +514,17 @@ def find_rules_matching(knl, pattern): return [r for r in knl.substitutions if pattern.match(r)] -def find_one_rule_matching(knl, pattern): - rules = find_rules_matching(knl, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." 
% ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f71bcfcb037a81c6b61fd9417fc98b75..029381d8d8ce20030140b2948fbf0ddf343ae39c 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -33,6 +33,14 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.program import CallablesTable +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext) +from pymbolic.primitives import Variable, Subscript, Lookup import logging logger = logging.getLogger(__name__) @@ -44,10 +52,152 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. 
+ return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. 
code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, callables_table, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -56,10 +206,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(callables_table, CallablesTable) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.callables_table = callables_table + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -92,13 +245,16 @@ class 
def map_call(self, expr, return_tuple=False):
    """Infer the result dtype(s) of a call and record its specialized callable.

    Accepts both :class:`pymbolic.primitives.Call` and
    :class:`pymbolic.primitives.CallWithKwargs` nodes.

    Two kinds of call are handled:

    * ``expr.function`` is a ``ResolvedFunction``: the callable stored under
      its name in ``self.callables_table`` is specialized with the inferred
      argument dtypes (``with_types``), bound to the kernel's target
      (``with_target``) and written back with ``with_callable``.
    * ``expr.function`` is a plain ``Variable``: each entry of
      ``self.kernel.function_manglers`` is tried in order; the first truthy
      result is wrapped in a ``ManglerCallable`` and added to the table with
      ``with_added_callable``.

    In both cases the identifier handed back by the callables table is
    recorded in ``self.old_calls_to_new_calls[expr]`` and
    ``self.callables_table`` is replaced by the updated table.

    :arg expr: the call node whose type is being inferred.
    :arg return_tuple: if *True*, the single entry of the returned list is
        the tuple of all result dtypes instead of one dtype.
    :returns: a list with at most one entry; the empty list if the result
        type could not be determined.
    :raises LoopyError: if the argument dtypes conflict with those of an
        already specialized callable, or if a mangled function with other
        than exactly one result is used while *return_tuple* is *False*.
    """

    from pymbolic.primitives import Variable, CallWithKwargs, Call

    if isinstance(expr, CallWithKwargs):
        kw_parameters = expr.kw_parameters
    else:
        assert isinstance(expr, Call)
        kw_parameters = {}

    identifier = expr.function
    if isinstance(identifier, (Variable, ResolvedFunction)):
        identifier = identifier.name

    def none_if_empty(d):
        # Unpack a one-element dtype set; an empty set means "unknown".
        if d:
            d, = d
            return d
        else:
            return None

    # Positional arguments are keyed by position, keyword arguments by name.
    arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in
            tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items()))

    # specializing the known function wrt type
    if isinstance(expr.function, ResolvedFunction):
        in_knl_callable = self.callables_table[expr.function.name]

        # {{{ checking that there is no overwriting of types of in_knl_callable

        if in_knl_callable.arg_id_to_dtype is not None:
            import numpy as np

            # specializing an already specialized function.
            known_dtypes = in_knl_callable.arg_id_to_dtype
            for arg_id, new_dtype in arg_id_to_dtype.items():
                if arg_id in known_dtypes and (
                        known_dtypes[arg_id] != new_dtype):
                    known_type = known_dtypes[arg_id].dtype.type

                    # A discrepancy between an unsigned and a signed integer
                    # of the same width is tolerated.
                    if known_type == np.uint32 and (
                            new_dtype.dtype.type == np.int32):
                        continue
                    if known_type == np.uint64 and (
                            new_dtype.dtype.type == np.int64):
                        continue

                    raise LoopyError("Overwriting a specialized function "
                            "is illegal--maybe start with new instance of "
                            "InKernelCallable?")

        # }}}

        in_knl_callable, self.callables_table = (
                in_knl_callable.with_types(
                    arg_id_to_dtype, self.kernel,
                    self.callables_table))

        in_knl_callable = in_knl_callable.with_target(self.kernel.target)

        # storing the type specialized function so that it can be used
        # later
        self.callables_table, new_function_id = (
                self.callables_table.with_callable(
                    expr.function.function,
                    in_knl_callable))

        # expr is known to be a Call or a CallWithKwargs (checked above).
        self.old_calls_to_new_calls[expr] = new_function_id

        new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype

        if new_arg_id_to_dtype is None:
            return []

        # collecting result dtypes in order of the assignees
        if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None:
            if return_tuple:
                return [get_return_types_as_tuple(new_arg_id_to_dtype)]
            else:
                return [new_arg_id_to_dtype[-1]]

    elif isinstance(expr.function, Variable):
        # Since the function is not "scoped", attempt to infer using
        # kernel.function_manglers

        # {{{ trying to infer using function manglers

        arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in
                expr.parameters)

        # finding the function_mangler which would be associated with the
        # realized function.
        mangle_result = None
        for function_mangler in self.kernel.function_manglers:
            mangle_result = function_mangler(self.kernel, identifier,
                    arg_dtypes)
            if mangle_result:
                # found a match.
                break

        if mangle_result is not None:
            from loopy.kernel.function_interface import (ManglerCallable,
                    ValueArgDescriptor)

            target = self.kernel.target

            # Arguments get ids 0, 1, ...; results get ids -1, -2, ...
            arg_id_to_dtype = dict((i, dt.with_target(target))
                    for i, dt in enumerate(mangle_result.arg_dtypes))
            arg_id_to_dtype.update(dict((-i-1, dt.with_target(target))
                    for i, dt in enumerate(mangle_result.result_dtypes)))

            arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in
                    enumerate(mangle_result.arg_dtypes))
            res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in
                    enumerate(mangle_result.result_dtypes))
            arg_id_to_descr = dict(arg_descrs+res_descrs)

            # creating the ManglerCallable object corresponding to the
            # function.
            in_knl_callable = ManglerCallable(
                    identifier, function_mangler, arg_id_to_dtype,
                    arg_id_to_descr, mangle_result.target_name)
            self.callables_table, new_function_id = (
                    self.callables_table.with_added_callable(
                        expr.function, in_knl_callable))

            # Record the renaming for this call. The mapping must be updated,
            # never replaced, or the renamings collected so far are lost.
            self.old_calls_to_new_calls[expr] = new_function_id

            # Returning the type.
            if return_tuple:
                return [mangle_result.result_dtypes]

            if len(mangle_result.result_dtypes) != 1:
                raise LoopyError("functions with more or fewer than one "
                        "return value may only be used in direct "
                        "assignments")

            return [mangle_result.result_dtypes[0]]

        # }}}

    return []
- type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, callables_table, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +841,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -576,9 +866,12 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, callables_table) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) + type_inf_mapper = type_inf_mapper.copy( + callables_table=callables_table) failed = not result if not failed: @@ -597,6 +890,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -635,23 +929,122 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. 
def _instruction_missed_during_inference(insn):
    """Return *True* if every variable written by *insn* has a known dtype.

    Returns *False* as soon as one assignee of *insn* refers to an argument
    or temporary of ``kernel`` whose ``dtype`` is *None*.

    :raises NotImplementedError: if an assignee is neither a variable, a
        (linear) subscript, nor a lookup wrapping one of these.
    """
    for lhs in insn.assignees:
        # A Lookup is examined through the expression it wraps.
        target = lhs.aggregate if isinstance(lhs, Lookup) else lhs

        if isinstance(target, Variable):
            written_name = target.name
        elif isinstance(target, (Subscript, LinearSubscript)):
            written_name = target.aggregate.name
        else:
            raise NotImplementedError("Unknown assignee type %s" %
                    type(target))

        if written_name in kernel.arg_dict:
            declaration = kernel.arg_dict[written_name]
        else:
            assert written_name in kernel.temporary_variables
            declaration = kernel.temporary_variables[written_name]

        if declaration.dtype is None:
            return False

    return True
def infer_unknown_types(program, expect_completion=False):
    """Infer types on temporaries and arguments of *program*'s root kernel.

    The callable stored under ``program.name`` in the program's callables
    table is looked up, type inference is run on its ``subkernel`` and the
    resulting kernel is stored back under the same name.

    :arg expect_completion: forwarded unchanged to
        :func:`infer_unknown_types_for_a_single_kernel`.
    :returns: a copy of *program* carrying the updated callables table.
    """
    original_table = program.callables_table

    root_callable = original_table[program.name]
    uninferred_kernel = root_callable.subkernel

    # The count is taken before entering edit mode and handed back on exit.
    callables_count_before = original_table.callables_count

    editable_table = program.callables_table.with_edit_callables_mode()
    inferred_kernel, editable_table = (
            infer_unknown_types_for_a_single_kernel(
                uninferred_kernel, editable_table, expect_completion))

    inferred_callable = root_callable.copy(subkernel=inferred_kernel)
    editable_table, _ = editable_table.with_callable(
            program.name, inferred_callable)

    final_table = editable_table.with_exit_edit_callables_mode(
            callables_count_before)

    return program.copy(callables_table=final_table)
infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -682,7 +1075,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.callables_table) # }}} diff --git a/test/test_apps.py b/test/test_apps.py index e7f4004fa0f2285920bdf9a0848c0d400e2c31b7..a9c3bf2a7815a518dbf4a8bf076a36530ae1c0b5 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -660,7 +661,7 @@ def test_domain_tree_nesting(): lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl.root_kernel.parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index bf168c11d838248947a2806123053e63c13ccbeb..d996230a5407b4517fb7cbd7009427436a16db80 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -76,6 +76,7 @@ def test_c_target_strides(): # test with C-order knl = __get_kernel('C') + lp.generate_code_v2(knl) a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), order='C') diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ca9b71c5aed23ae585cb06eb82f7aba1717995 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,69 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit 
def test_register_function_lookup(ctx_factory):
    """Check that a function supplied through a lookup mapper is usable.

    ``log2`` is made known to the program via
    ``register_function_id_to_in_knl_callable_mapper``; the kernel result is
    then compared against :func:`numpy.log2`.
    """
    # Use only the context supplied by the fixture, so that the test runs on
    # the device selected by the test harness.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from testlib import register_log2_lookup

    x = np.random.rand(10)

    prog = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            y[i] = log2(x[i])
            """)
    prog = lp.register_function_id_to_in_knl_callable_mapper(prog,
            register_log2_lookup)

    _, (out, ) = prog(queue, x=x)

    assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15
knl = lp.make_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) @@ -66,6 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.make_program(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_domain.py b/test/test_domain.py index ebfde850907d68bebf06076fbf1c87d8bb093f71..dd789d2cd8152413c815fddbbca62b94797623bf 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -61,20 +61,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = 
lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) - assert knl.parents_per_domain()[1] == 0 + assert knl.root_kernel.parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - 
print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl.root_kernel.parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j bb = a[i] - b[i] @@ -122,16 +122,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code_v2(prog).device_code() + assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -143,13 +142,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert 
knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -183,16 +186,12 @@ def test_simple_side_effect(ctx_factory): """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -203,17 +202,14 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): @@ -225,17 +221,14 @@ def test_wg_too_small(ctx_factory): " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - import pytest - for gen_knl in kernel_gen: - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, 
gen_knl).get_code() + print(knl) + with pytest.raises(RuntimeError): + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): @@ -247,17 +240,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) # {{{ code generator fuzzing @@ -414,17 +404,19 @@ def test_ilp_write_race_detection_global(ctx_factory): lp.GlobalArg("a", np.float32), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) + knl = lp.preprocess_kernel(knl) with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl)) + list(lp.generate_loop_schedules(knl.root_kernel, + knl.callables_table)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -438,13 +430,13 @@ def test_ilp_write_race_avoidance_local(ctx_factory): [ "<> a[i] = 5+i+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) def test_ilp_write_race_avoidance_private(ctx_factory): @@ -455,13 +447,13 @@ def test_ilp_write_race_avoidance_private(ctx_factory): [ "<> a = 5+j", ], 
- []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + knl = lp.preprocess_kernel(knl) + assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} @@ -482,11 +474,12 @@ def test_write_parameter(ctx_factory): lp.GlobalArg("b", dtype, shape=()), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) import pytest with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() + lp.generate_code_v2(knl).device_code() # {{{ arg guessing @@ -507,10 +500,11 @@ def test_arg_shape_guessing(ctx_factory): lp.GlobalArg("c", shape=lp.auto), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing(ctx_factory): @@ -523,10 +517,11 @@ def test_arg_guessing(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing_with_reduction(ctx_factory): @@ -541,16 +536,16 @@ def test_arg_guessing_with_reduction(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -566,11 +561,11 @@ def 
test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -587,10 +582,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -607,9 +603,7 @@ def test_offsets_and_slicing(ctx_factory): assumptions="n>=1 and m>=1", default_offset=lp.auto) - knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1") - - cknl = lp.CompiledKernel(ctx, knl) + knl = lp.tag_array_axes(knl, "a,b", "stride:auto,stride:1") a_full = cl.clrandom.rand(queue, (n, n), np.float64) a_full_h = a_full.get() @@ -624,8 +618,10 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - print(cknl.get_highlighted_code({"a": a.dtype})) - cknl(queue, a=a, b=b) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + + print(lp.generate_code_v2(knl)) + knl(queue, a=a, b=b) import numpy.linalg as la assert la.norm(b_full.get() - b_full_h) < 1e-13 @@ -642,18 +638,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." 
- ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -674,18 +668,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -728,8 +723,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): None, shape=None), "..."]) - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog.root_kernel.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -743,14 +738,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,7 +763,7 @@ def test_vector_types(ctx_factory, vec_len): ref_knl = knl - knl = lp.tag_data_axes(knl, "out", "c,vec") + knl = lp.tag_array_axes(knl, "out", "c,vec") knl = lp.tag_inames(knl, dict(j="unr")) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -898,11 +891,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -980,9 +969,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. 
- knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() def test_indexof(ctx_factory): @@ -1014,7 +1001,7 @@ def test_indexof_vec(ctx_factory): ''' out[i,j,k] = indexof_vec(out[i,j,k])''') knl = lp.tag_inames(knl, {"i": "vec"}) - knl = lp.tag_data_axes(knl, "out", "vec,c,c") + knl = lp.tag_array_axes(knl, "out", "vec,c,c") knl = lp.set_options(knl, write_cl=True) (evt, (out,)) = knl(queue) @@ -1156,7 +1143,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2440,10 +2416,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2453,7 +2430,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel('{[i]: 0 <= i < 10}', + prog = lp.make_kernel('{[i]: 0 <= i < 10}', """ for i a[i] = i {id=a} @@ -2468,15 +2445,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0') from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog.root_kernel knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( 
knl.copy(), vecsize)) + prog = prog.with_root_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): @@ -2485,7 +2464,7 @@ def test_multi_argument_reduction_type_inference(): from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog.root_kernel.get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2647,7 +2628,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == set(["n"]) + assert knl.root_kernel.all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2666,7 +2647,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2679,11 +2660,15 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"]) - assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"]) - assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"]) - assert knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"]) + assert prog.root_kernel.id_to_insn["insn1"].depends_on == set() + assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + set(["insn2"])) + assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + set(["insn3"])) + assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + "insn2"])) + assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + set(["insn1", "insn5"])) def 
test_preamble_with_separate_temporaries(ctx_factory): @@ -2777,7 +2762,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=ntmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -492,28 +495,34 @@ def test_add_nosync(): tmp5[i] = 1 {id=insn6,conflicts=g1} """) - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + 
prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog.root_kernel.id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -522,28 +531,30 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", []) + new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_root_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = set(insn.id for insn in knl.instructions) + insn_ids = set(insn.id for insn in prog.root_kernel.instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} """) - knl = lp.split_iname(knl, "i", 4, within='id:to_split') + prog = lp.split_iname(prog, "i", 4, within='id:to_split') - for insn in knl.instructions: + for insn in prog.root_kernel.instructions: if insn.id == 'to_split': assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) if insn.id == 'not_to_split': @@ -554,7 +565,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -564,10 +575,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + prg = 
lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in six.itervalues(prg.callables_table.resolved_functions)) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, prg) if __name__ == "__main__": diff --git a/test/testlib.py b/test/testlib.py index ad290ee7c60297aadd4a6baa0814b8976403cb53..853e2584a1e10732b3ec49cd737016734cdea5fa 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + def __call__(self, insn_ids, callables_table, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, + callables_table, ignore_auto) return gsize, (self.vecsize,) # }}} @@ -132,4 +134,48 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel, callables_table): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return ( + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + 
callables_table) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker